diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1774,13 +1774,15 @@ WorklistInserter AddNodes(*this); + DAG.AssignTopologicalOrder(); + // Add all the dag nodes to the worklist. // // Note: All nodes are not added to PruningList here, this is because the only // nodes which can be deleted are those which have no uses and all other nodes // which would otherwise be added to the worklist by the first call to // getNextWorklistEntry are already present in it. - for (SDNode &Node : DAG.allnodes()) + for (SDNode &Node : reverse(DAG.allnodes())) AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty()); // Create a dummy node (which is not added to allnodes), that adds a reference diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -12,14 +12,14 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movq (%esp), %mm0 -; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm1 +; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; CHECK-NEXT: movq (%esp), %mm1 ; CHECK-NEXT: maskmovq %mm0, %mm1 ; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll --- a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll +++ b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll @@ -10,11 +10,13 @@ ; CHECK-LABEL: f: ; CHECK: # %bb.0: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd %xmm1, atomic -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movsd %xmm1, atomic2 -; CHECK-NEXT: movsd %xmm0, anything +; CHECK-NEXT: movsd %xmm0, atomic +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movsd %xmm0, atomic2 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, anything+4 +; CHECK-NEXT: movl %eax, anything ; CHECK-NEXT: movl ioport, %ecx ; CHECK-NEXT: movl ioport, %eax ; CHECK-NEXT: shrl $16, %eax diff --git a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll --- a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll +++ b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll @@ -8,8 +8,11 @@ ; CHECK-LABEL: test: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl $-2, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: leal -2(%eax,%ecx), %eax ; CHECK-NEXT: retl entry: %0 = ptrtoint ptr %a to i32 diff --git a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll --- a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll +++ b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: cpuid ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: movl %ebx, 8(%esi) -; CHECK-NEXT: movl %ecx, 12(%esi) ; CHECK-NEXT: movl %edx, 16(%esi) +; CHECK-NEXT: movl %ecx, 12(%esi) +; CHECK-NEXT: movl %ebx, 8(%esi) ; CHECK-NEXT: movl %eax, 4(%esi) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -9,9 +9,8 @@ ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %r10d ; CHECK-NEXT: addl $4, %r10d -; CHECK-NEXT: shrq $6, %rdx -; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC -; CHECK-NEXT: movl (%rdi,%rdx), %edx +; CHECK-NEXT: shrq $8, %rdx +; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $5, %esi diff --git a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll --- a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll +++ b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll @@ -17,17 +17,17 @@ ; CHECK-NEXT: movq %rdx, (%rsp) ; CHECK-NEXT: movq 24(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 16(%rdi), %rdx +; CHECK-NEXT: movq 56(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 32(%rdi), %rdx +; CHECK-NEXT: movq 48(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq 40(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 48(%rdi), %rdx +; CHECK-NEXT: movq 32(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 56(%rdi), %rdx +; CHECK-NEXT: movq 16(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %al, (%rsp) ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,21 +17,19 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl i(%rip), %esi -; CHECK-NEXT: movl j(%rip), %eax -; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq i(%rip), %rdx +; CHECK-NEXT: movq j(%rip), %rsi +; CHECK-NEXT: movsbl %sil, %eax +; CHECK-NEXT: idivb %dl +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %edx -; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: cbtw +; CHECK-NEXT: shrl $8, %esi +; CHECK-NEXT: movsbl %sil, %eax ; CHECK-NEXT: idivb %dl -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: idivb %sil -; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 +; CHECK-NEXT: pinsrb $1, %eax, %xmm0 ; CHECK-NEXT: pextrw $0, %xmm0, res(%rip) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -110,12 +110,12 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB2_2: # %.lr.ph ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovups %xmm0, 16(%rsi) ; CHECK-NEXT: addq $32, %rsi ; CHECK-NEXT: decl %edi ; CHECK-NEXT: jne .LBB2_2 ; CHECK-NEXT: .LBB2_3: # %._crit_edge -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge diff --git a/llvm/test/CodeGen/X86/WidenArith.ll b/llvm/test/CodeGen/X86/WidenArith.ll --- a/llvm/test/CodeGen/X86/WidenArith.ll +++ b/llvm/test/CodeGen/X86/WidenArith.ll @@ -9,8 +9,15 @@ ; X86-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vsubps %ymm2, %ymm1, %ymm3 ; X86-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X86-NEXT: vcmpltps %ymm3, %ymm2, %ymm1 -; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: retl ; @@ -20,8 +27,15 @@ ; X64-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vsubps %ymm2, %ymm1, %ymm3 ; X64-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcmpltps %ymm3, %ymm2, %ymm1 -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %c1 = fadd <8 x float> %a, %b diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -20,13 +20,15 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -50,13 +52,15 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -80,13 +84,15 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -110,13 +116,15 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -129,13 +137,19 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +173,19 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -20,13 +20,13 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -50,13 +50,13 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -80,13 +80,13 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -110,13 +110,13 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -129,13 +129,13 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +159,13 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/absolute-constant.ll b/llvm/test/CodeGen/X86/absolute-constant.ll --- a/llvm/test/CodeGen/X86/absolute-constant.ll +++ b/llvm/test/CodeGen/X86/absolute-constant.ll @@ -10,7 +10,10 @@ define void @bar(ptr %x) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $foo, (%rdi) +; CHECK-NEXT: movsbl (%rdi), %eax +; CHECK-NEXT: movl $foo, %ecx +; CHECK-NEXT: movsbl %cl, %ecx +; CHECK-NEXT: testl %ecx, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: xorl %eax, %eax @@ -20,7 +23,10 @@ ; ; PIC-LABEL: bar: ; PIC: # %bb.0: # %entry -; PIC-NEXT: testb $foo, (%rdi) +; PIC-NEXT: movsbl (%rdi), %eax +; PIC-NEXT: movl $foo, %ecx +; PIC-NEXT: movsbl %cl, %ecx +; PIC-NEXT: testl %ecx, %eax ; PIC-NEXT: je .LBB0_1 ; PIC-NEXT: # %bb.2: # %if.then ; PIC-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -368,7 +368,7 @@ ; CHECK-NEXT: addq $66, %rsi ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: decw (%rdx,%rsi) +; CHECK-NEXT: decw (%rsi,%rdx) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %i, 66 @@ -414,7 +414,7 @@ ; CHECK-NEXT: addq $66, %rdx ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rdx -; CHECK-NEXT: decw (%rsi,%rdx) +; CHECK-NEXT: decw (%rdx,%rsi) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %idx, 66 diff --git a/llvm/test/CodeGen/X86/add-of-mul.ll b/llvm/test/CodeGen/X86/add-of-mul.ll --- a/llvm/test/CodeGen/X86/add-of-mul.ll +++ b/llvm/test/CodeGen/X86/add-of-mul.ll @@ -26,7 +26,8 @@ define <4 x i32> @test_vector(<4 x i32> %x) { ; CHECK-LABEL: test_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $2, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 ; CHECK-NEXT: retq %mul = mul <4 x i32> %x, %add = add <4 x i32> %mul, %x diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -344,7 +344,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -367,7 +367,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -430,10 +430,10 @@ ; X86-LABEL: test_i32_add_sub_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -455,10 +455,10 @@ ; X86-LABEL: test_i32_add_sub_commute_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -480,10 +480,10 @@ ; X86-LABEL: test_i32_sub_add_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: retl ; @@ -508,7 +508,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -559,7 +559,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl @@ -584,7 +584,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -605,10 +605,10 @@ ; X86-LABEL: test_i32_sub_sum_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: negl %eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -316,21 +316,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %r10d -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: adcq 16(%rsi), %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %r10 -; CHECK-NEXT: adcq 24(%rsi), %rcx -; CHECK-NEXT: addq %r9, %rcx -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r10, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: adcq 24(%rsi), %r9 +; CHECK-NEXT: movq %rdx, (%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 @@ -391,15 +383,15 @@ define i128 @addcarry_to_subcarry(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: addcarry_to_subcarry: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: notq %rcx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: setb %dl +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpq %rsi, %rdi -; CHECK-NEXT: notq %rsi -; CHECK-NEXT: setae %cl -; CHECK-NEXT: addb $-1, %cl -; CHECK-NEXT: adcq $0, %rax -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %edx -; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: setae %al +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: retq %notb = xor i64 %b, -1 @@ -418,9 +410,12 @@ ; CHECK-LABEL: addcarry_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -448,9 +443,12 @@ ; CHECK-LABEL: addcarry_hidden_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -482,9 +480,12 @@ ; CHECK-LABEL: addcarry_hidden2_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -516,9 +517,12 @@ ; CHECK-LABEL: addcarry_2x64_or_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -546,9 +550,12 @@ ; CHECK-LABEL: addcarry_2x64_xor_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: xorb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -576,10 +583,13 @@ ; CHECK-LABEL: addcarry_2x64_and_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) %s0 = extractvalue { i64, i1 } %t0, 0 @@ -636,10 +646,13 @@ define { i64, i1 } @addcarry_fake_carry(i64 %a, i64 %b, i1 %carryin) nounwind { ; CHECK-LABEL: addcarry_fake_carry: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: btl $0, %edx -; CHECK-NEXT: adcq %rsi, %rax +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: setb %dl +; CHECK-NEXT: orb %cl, %dl ; CHECK-NEXT: retq %t1 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %partial = extractvalue { i64, i1 } %t1, 0 @@ -742,17 +755,20 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax ; CHECK-NEXT: movq 24(%rdi), %r10 ; CHECK-NEXT: movq 32(%rdi), %r11 +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: adcq %rcx, %rbx ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: adcq %rcx, %rdx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: adcq $0, %rbx ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: adcq %r8, %rcx -; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: cmpq %rax, %rbx ; CHECK-NEXT: adcq $0, %rcx ; CHECK-NEXT: leaq (%r11,%r9), %rsi ; CHECK-NEXT: addq %r8, %r10 @@ -764,10 +780,12 @@ ; CHECK-NEXT: cmpq %rsi, %r8 ; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %r9, %r11 -; CHECK-NEXT: movq %rdx, 16(%rdi) +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %rbx, 16(%rdi) ; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: movq %r8, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 0, i64 1 @@ -820,10 +838,22 @@ define i32 @add_U320_without_i128_or(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_or: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al ; CHECK-NEXT: movzbl %al, %eax @@ -875,10 +905,22 @@ define i32 @add_U320_without_i128_xor(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_xor: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: xorb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: xorb %al, %cl +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al ; CHECK-NEXT: movzbl %al, %eax @@ -932,9 +974,15 @@ define i32 @bogus_add_U320_without_i128_and(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: bogus_add_U320_without_i128_and: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) -; CHECK-NEXT: addq %rcx, 16(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: andb %al, %sil +; CHECK-NEXT: addb $-1, %sil +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) ; CHECK-NEXT: addq %r8, 24(%rdi) ; CHECK-NEXT: addq %r9, 32(%rdi) ; CHECK-NEXT: xorl %eax, %eax @@ -986,11 +1034,25 @@ define void @add_U320_without_i128_or_no_ret(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_or_no_ret: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) -; CHECK-NEXT: adcq %r9, 32(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: addq 32(%rdi), %r9 +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: addq %r9, %rax +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: movq %rax, 32(%rdi) ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 0, i64 1 @@ -1035,12 +1097,24 @@ define i32 @add_U320_uaddo(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_uaddo: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 @@ -1103,14 +1177,22 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rcx -; CHECK-NEXT: addq (%rdx), %rcx -; CHECK-NEXT: movq %rcx, (%rdi) -; CHECK-NEXT: movq 8(%rsi), %rcx -; CHECK-NEXT: adcq 8(%rdx), %rcx -; CHECK-NEXT: movq %rcx, 8(%rdi) -; CHECK-NEXT: movq 16(%rsi), %rcx -; CHECK-NEXT: adcq 16(%rdx), %rcx -; CHECK-NEXT: movq %rcx, 16(%rdi) +; CHECK-NEXT: movq (%rdx), %rdi +; CHECK-NEXT: leaq (%rcx,%rdi), %r8 +; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: movq 8(%rsi), %r8 +; CHECK-NEXT: addq 8(%rdx), %r8 +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %r9b, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movq %r8, 8(%rax) +; CHECK-NEXT: movq 16(%rsi), %rsi +; CHECK-NEXT: addq 16(%rdx), %rsi +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 %5 = load i64, ptr %2, align 8 @@ -1150,9 +1232,12 @@ define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, ptr nocapture %4) nounwind { ; CHECK-LABEL: uaddo_U128_without_i128_or: ; CHECK: # %bb.0: +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %cl ; CHECK-NEXT: addq %rdx, %rdi -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %al +; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: movq %rsi, (%r8) ; CHECK-NEXT: movq %rdi, 8(%r8) ; CHECK-NEXT: retq @@ -1177,12 +1262,18 @@ ; CHECK-LABEL: add_U192_without_i128_or: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %r9, %rdx +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %r8, %rsi -; CHECK-NEXT: adcq %r9, %rdx -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, (%rdi) -; CHECK-NEXT: movq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rsi, 16(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %r8b +; CHECK-NEXT: orb %dil, %r8b +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movzbl %r8b, %edi +; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: movq %rdi, (%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: retq %8 = add i64 %4, %1 %9 = icmp ult i64 %8, %1 @@ -1214,9 +1305,14 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi +; CHECK-NEXT: addq 8(%rsi), %rdi +; CHECK-NEXT: setb %r8b ; CHECK-NEXT: addq (%rsi), %rcx -; CHECK-NEXT: adcq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: orb %r8b, %r9b ; CHECK-NEXT: movq 16(%rdx), %r8 +; CHECK-NEXT: addb $-1, %r9b ; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx @@ -1274,15 +1370,22 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi +; CHECK-NEXT: addq 8(%rsi), %rdi +; CHECK-NEXT: setb %r8b ; CHECK-NEXT: addq (%rsi), %rcx -; CHECK-NEXT: adcq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: orb %r8b, %r9b ; CHECK-NEXT: movq 16(%rdx), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx -; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: addq 16(%rsi), %r8 ; CHECK-NEXT: adcq 24(%rsi), %rdx +; CHECK-NEXT: movzbl %r9b, %esi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -316,7 +316,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -326,7 +326,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -336,7 +336,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -976,38 +976,77 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -1027,22 +1066,22 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1050,21 +1089,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1072,21 +1111,21 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1160,20 +1199,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1181,19 +1220,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1201,18 +1240,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1285,20 +1324,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1306,19 +1345,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1326,18 +1365,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1410,19 +1449,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1430,17 +1469,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,16 +1487,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1535,25 +1574,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1561,20 +1600,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1582,19 +1621,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1665,21 +1705,21 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1687,16 +1727,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1704,16 +1744,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1785,19 +1825,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1805,15 +1845,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1821,15 +1861,15 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1901,19 +1941,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1921,16 +1961,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1945,11 +1985,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2059,15 +2099,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2075,15 +2115,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2097,11 +2137,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2211,15 +2251,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2227,15 +2267,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2249,11 +2289,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2285,12 +2325,12 @@ ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2311,12 +2351,12 @@ ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2337,9 +2377,9 @@ ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -2414,22 +2454,23 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2439,11 +2480,11 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2455,11 +2496,11 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2548,10 +2589,10 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2580,13 +2621,14 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2598,13 +2640,14 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2683,36 +2726,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2725,10 +2768,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2803,8 +2846,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2818,11 +2861,11 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2851,7 +2894,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2869,7 +2911,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2953,36 +2994,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2995,10 +3036,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -3084,13 +3125,13 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3120,7 +3161,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3138,7 +3178,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3189,10 +3228,10 @@ ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq @@ -3211,10 +3250,10 @@ ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq @@ -3230,12 +3269,12 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ -3345,8 +3384,8 @@ ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3360,8 +3399,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3486,18 +3525,18 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -3514,9 +3553,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3530,9 +3568,8 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3616,10 +3653,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3641,35 +3678,37 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3746,26 +3785,26 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3879,10 +3918,10 @@ ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3904,35 +3943,35 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4008,12 +4047,12 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ -4116,8 +4155,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4130,8 +4169,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4150,13 +4189,14 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4164,13 +4204,14 @@ ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4247,29 +4288,29 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4282,11 +4323,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4299,11 +4340,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4378,10 +4419,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -4411,11 +4452,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4428,11 +4469,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4511,17 +4552,19 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4548,11 +4591,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4565,11 +4608,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4608,8 +4651,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4622,8 +4665,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4636,8 +4679,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4760,15 +4803,15 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2] -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4796,11 +4839,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4813,11 +4856,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4870,8 +4913,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4884,8 +4927,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4896,11 +4939,11 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4988,17 +5031,17 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5007,14 +5050,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5022,10 +5065,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5034,10 +5077,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5046,10 +5089,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5078,17 +5121,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5096,14 +5139,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5111,10 +5154,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5123,10 +5166,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5135,10 +5178,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5167,17 +5210,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5185,14 +5228,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5200,10 +5243,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5212,10 +5255,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5224,10 +5267,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5255,31 +5298,31 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5287,10 +5330,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5299,10 +5342,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5311,10 +5354,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5342,44 +5385,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5387,10 +5430,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5398,10 +5441,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5547,44 +5590,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5592,10 +5635,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5603,10 +5646,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -283,7 +283,7 @@ ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -292,7 +292,7 @@ ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -301,7 +301,7 @@ ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -609,16 +609,16 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -626,40 +626,37 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -711,12 +708,12 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -724,36 +721,29 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -855,19 +845,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -876,17 +866,17 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -896,16 +886,16 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -966,18 +956,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -989,10 +979,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1003,10 +993,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1064,18 +1054,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1087,10 +1077,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1101,10 +1091,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1164,15 +1154,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1180,24 +1170,24 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 +; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1268,10 +1258,10 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1279,16 +1269,16 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE42-NEXT: pshufb %xmm2, %xmm1 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1296,15 +1286,15 @@ ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1361,30 +1351,30 @@ ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1392,10 +1382,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1452,27 +1442,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1480,10 +1470,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1542,22 +1532,22 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1565,11 +1555,11 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1626,35 +1616,35 @@ ; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 32(%rdi), %xmm1 ; SSE2-NEXT: movaps 48(%rdi), %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1738,36 +1728,36 @@ ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 32(%rdi), %xmm1 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1780,62 +1770,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512DQ-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> @@ -1930,11 +1890,10 @@ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1945,11 +1904,10 @@ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2020,10 +1978,10 @@ ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2048,10 +2006,12 @@ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2062,10 +2022,12 @@ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2159,11 +2121,10 @@ ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2173,11 +2134,10 @@ ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2229,13 +2189,13 @@ ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2247,12 +2207,12 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2271,33 +2231,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2392,11 +2350,10 @@ ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2406,11 +2363,10 @@ ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2502,33 +2458,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2567,10 +2521,10 @@ ; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa 16(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) +; SSE2-NEXT: movdqa %xmm0, 32(%rdx) ; SSE2-NEXT: movdqa %xmm3, 16(%rdx) ; SSE2-NEXT: movdqa %xmm2, (%rdx) ; SSE2-NEXT: retq @@ -2585,10 +2539,10 @@ ; SSE42-NEXT: paddb (%rsi), %xmm3 ; SSE42-NEXT: movdqa 16(%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rdx) +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq @@ -2600,12 +2554,12 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2700,8 +2654,8 @@ ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2712,8 +2666,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2927,10 +2881,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2950,10 +2904,12 @@ ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -2963,10 +2919,12 @@ ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3029,10 +2987,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3135,13 +3093,13 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3243,13 +3201,13 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; @@ -3333,10 +3291,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3344,10 +3302,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3440,13 +3398,13 @@ ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3468,11 +3426,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3480,11 +3438,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3571,11 +3529,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3583,11 +3541,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3644,14 +3602,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3673,11 +3631,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3685,11 +3643,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3715,14 +3673,14 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movaps 48(%rdi), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3731,10 +3689,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3830,14 +3788,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3858,11 +3816,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3870,11 +3828,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3904,10 +3862,10 @@ ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3916,10 +3874,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3927,11 +3885,11 @@ ; AVX-NEXT: vmovapd (%rdi), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3987,40 +3945,40 @@ ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4028,10 +3986,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4039,10 +3997,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4065,40 +4023,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4106,10 +4064,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4117,10 +4075,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4143,40 +4101,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4184,10 +4142,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4195,10 +4153,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4221,39 +4179,39 @@ ; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) -; AVX-NEXT: retq +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4261,10 +4219,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4272,10 +4230,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4299,59 +4257,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4374,40 +4332,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4415,10 +4373,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4426,10 +4384,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4454,40 +4412,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4495,10 +4453,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4506,10 +4464,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4534,39 +4492,39 @@ ; SSE-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4574,10 +4532,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4585,10 +4543,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4614,59 +4572,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4691,60 +4649,60 @@ ; SSE-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4769,59 +4727,59 @@ ; SSE-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4847,59 +4805,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4924,59 +4882,59 @@ ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5002,59 +4960,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5080,22 +5038,22 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm3, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: paddb 32(%rsi), %xmm0 +; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq 16(%rdi), %rax +; AVX-NEXT: movq (%rdi), %rax ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: movq %rax, %r8 ; AVX-NEXT: movq %rax, %r9 @@ -5115,7 +5073,7 @@ ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 24(%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx ; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx @@ -5137,7 +5095,7 @@ ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $48, %rcx ; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: movq 16(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax @@ -5159,7 +5117,7 @@ ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $48, %rax ; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: movq 8(%rdi), %rax +; AVX-NEXT: movq 24(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx ; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movl %eax, %ecx @@ -5183,14 +5141,14 @@ ; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 ; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq ; @@ -5286,39 +5244,310 @@ ; AVX2-NEXT: shrq $56, %rax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: movq %rax, %r9 +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: movl %eax, %r11d +; AVX512F-NEXT: movl %eax, %ebx +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $16, %ebx +; AVX512F-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $24, %r11d +; AVX512F-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $32, %r10 +; AVX512F-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $40, %r9 +; AVX512F-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $48, %r8 +; AVX512F-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: movq 24(%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $24, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $32, %rax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $40, %rax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq 8(%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: movq 16(%rdi), %rax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: movl %eax, %r11d +; AVX512DQ-NEXT: movl %eax, %ebx +; AVX512DQ-NEXT: vmovd %eax, %xmm0 +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $16, %ebx +; AVX512DQ-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $24, %r11d +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $32, %r10 +; AVX512DQ-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $40, %r9 +; AVX512DQ-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $48, %r8 +; AVX512DQ-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq 24(%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq (%rdi), %rcx +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $16, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $24, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $32, %rax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $40, %rax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $48, %rax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq 8(%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq 16(%rdi), %rax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: movl %eax, %r11d +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $16, %ebx +; AVX512BW-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $24, %r11d +; AVX512BW-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $32, %r10 +; AVX512BW-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $40, %r9 +; AVX512BW-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $48, %r8 +; AVX512BW-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: movq 24(%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq (%rdi), %rcx +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $24, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $32, %rax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $40, %rax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $48, %rax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq 8(%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -93,8 +93,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -258,8 +258,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -421,8 +421,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -589,8 +589,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -691,8 +691,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -202,10 +202,10 @@ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl %edi, 8(%esi) -; X86-SSE2-NEXT: movl %edx, 12(%esi) -; X86-SSE2-NEXT: movl %eax, (%esi) +; X86-SSE2-NEXT: movl %edi, 12(%esi) +; X86-SSE2-NEXT: movl %edx, 8(%esi) ; X86-SSE2-NEXT: movl %ecx, 4(%esi) +; X86-SSE2-NEXT: movl %eax, (%esi) ; X86-SSE2-NEXT: movl %esi, %eax ; X86-SSE2-NEXT: leal -8(%ebp), %esp ; X86-SSE2-NEXT: popl %esi @@ -242,10 +242,10 @@ ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLM-NEXT: movl %edi, 8(%esi) -; X86-SLM-NEXT: movl %edx, 12(%esi) -; X86-SLM-NEXT: movl %eax, (%esi) +; X86-SLM-NEXT: movl %edi, 12(%esi) +; X86-SLM-NEXT: movl %edx, 8(%esi) ; X86-SLM-NEXT: movl %ecx, 4(%esi) +; X86-SLM-NEXT: movl %eax, (%esi) ; X86-SLM-NEXT: movl %esi, %eax ; X86-SLM-NEXT: leal -8(%ebp), %esp ; X86-SLM-NEXT: popl %esi @@ -282,11 +282,11 @@ ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-ATOM-NEXT: movl %eax, 8(%esi) -; X86-ATOM-NEXT: movl %edi, 12(%esi) -; X86-ATOM-NEXT: movl %ecx, (%esi) -; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %eax, 12(%esi) +; X86-ATOM-NEXT: movl %edi, 8(%esi) ; X86-ATOM-NEXT: movl %edx, 4(%esi) +; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %ecx, (%esi) ; X86-ATOM-NEXT: leal -8(%ebp), %esp ; X86-ATOM-NEXT: popl %esi ; X86-ATOM-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll --- a/llvm/test/CodeGen/X86/atomic-mi.ll +++ b/llvm/test/CodeGen/X86/atomic-mi.ll @@ -751,10 +751,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: andl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -973,10 +973,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: orl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1195,10 +1195,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: xorl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1603,10 +1603,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: notl %edx ; X32-NEXT: notl %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: notl %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -436,10 +436,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $20, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 12 @@ -517,10 +517,10 @@ ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, 8(%esi) -; X86-NOSSE-NEXT: movl %edx, 12(%esi) -; X86-NOSSE-NEXT: movl %eax, (%esi) +; X86-NOSSE-NEXT: movl %edi, 12(%esi) +; X86-NOSSE-NEXT: movl %edx, 8(%esi) ; X86-NOSSE-NEXT: movl %ecx, 4(%esi) +; X86-NOSSE-NEXT: movl %eax, (%esi) ; X86-NOSSE-NEXT: movl %esi, %eax ; X86-NOSSE-NEXT: addl $20, %esp ; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll @@ -1497,13 +1497,12 @@ ; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $123, %ecx ; CHECK-NEXT: testb $32, %al +; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: jne .LBB51_4 ; CHECK-NEXT: # %bb.3: # %if.then -; CHECK-NEXT: movq 32(%rdi), %rcx +; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: .LBB51_4: # %return -; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: retq entry: %0 = atomicrmw xor ptr %v, i64 16 monotonic, align 8 diff --git a/llvm/test/CodeGen/X86/atomic-xor.ll b/llvm/test/CodeGen/X86/atomic-xor.ll --- a/llvm/test/CodeGen/X86/atomic-xor.ll +++ b/llvm/test/CodeGen/X86/atomic-xor.ll @@ -40,10 +40,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll --- a/llvm/test/CodeGen/X86/atomic128.ll +++ b/llvm/test/CodeGen/X86/atomic128.ll @@ -63,10 +63,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -173,10 +173,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -241,10 +241,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -309,10 +309,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -377,10 +377,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -448,10 +448,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -519,10 +519,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -590,10 +590,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -661,10 +661,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -731,10 +731,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -803,10 +803,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -16,9 +16,10 @@ ; ; AVX-LABEL: avg_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <4 x i8>, ptr %a @@ -42,13 +43,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -69,12 +102,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -90,28 +165,28 @@ define void @avg_v24i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v24i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v24i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v24i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %xmm0, (%rax) @@ -120,8 +195,8 @@ ; ; AVX512-LABEL: avg_v24i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rax) ; AVX512-NEXT: vmovdqu %xmm0, (%rax) @@ -142,36 +217,89 @@ define void @avg_v32i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -190,12 +318,12 @@ define void @avg_v48i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v48i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) @@ -203,12 +331,12 @@ ; ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) @@ -216,10 +344,10 @@ ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -227,10 +355,10 @@ ; ; AVX512F-LABEL: avg_v48i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -238,8 +366,8 @@ ; ; AVX512BW-LABEL: avg_v48i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -259,14 +387,14 @@ define void @avg_v64i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 -; SSE2-NEXT: pavgb 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -275,46 +403,173 @@ ; ; AVX1-LABEL: avg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm5, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512BW-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512BW-NEXT: vpmovdb %zmm5, %xmm5 +; AVX512BW-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpavgb %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -339,13 +594,34 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i16>, ptr %a %2 = load <4 x i16>, ptr %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -366,12 +642,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -387,36 +692,55 @@ define void @avg_v16i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -435,14 +759,14 @@ define void @avg_v32i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -451,46 +775,93 @@ ; ; AVX1-LABEL: avg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -509,16 +880,16 @@ define void @avg_v40i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v40i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pavgw 64(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 64(%rdi), %xmm4 +; SSE2-NEXT: pavgw 64(%rsi), %xmm4 ; SSE2-NEXT: movdqu %xmm4, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) @@ -528,16 +899,16 @@ ; ; AVX1-LABEL: avg_v40i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX1-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-NEXT: vpavgw (%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-NEXT: vpavgw 64(%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-NEXT: vpavgw (%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgw 32(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vpavgw 48(%rsi), %xmm4, %xmm4 ; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) @@ -547,12 +918,12 @@ ; ; AVX2-LABEL: avg_v40i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX2-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vmovdqu %xmm2, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) @@ -561,12 +932,12 @@ ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %xmm2, (%rax) @@ -575,10 +946,10 @@ ; ; AVX512BW-LABEL: avg_v40i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-NEXT: vpavgw 64(%rdi), %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-NEXT: vpavgw 64(%rsi), %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -606,9 +977,10 @@ ; ; AVX-LABEL: avg_v4i8_2: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <4 x i8>, ptr %a @@ -632,13 +1004,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -659,12 +1063,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -690,26 +1136,79 @@ ; ; AVX1-LABEL: avg_v32i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -740,28 +1239,96 @@ ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) -; AVX2-NEXT: vmovups %ymm0, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v64i8_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rsi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v64i8_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, (%rax) +; AVX512F-NEXT: vpmovdb %zmm2, (%rax) +; AVX512F-NEXT: vpmovdb %zmm1, (%rax) +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <64 x i8>, ptr %a %2 = load <64 x i8>, ptr %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -784,13 +1351,34 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v4i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v4i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v4i16_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i16_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i16>, ptr %a %2 = load <4 x i16>, ptr %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -811,12 +1399,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -842,26 +1459,45 @@ ; ; AVX1-LABEL: avg_v16i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -896,46 +1532,93 @@ ; ; AVX1-LABEL: avg_v32i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -961,7 +1644,9 @@ ; ; AVX-LABEL: avg_v4i8_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq @@ -982,12 +1667,35 @@ ; SSE2-NEXT: movq %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = zext <8 x i8> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1005,12 +1713,40 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = zext <16 x i8> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -1033,17 +1769,40 @@ ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1051,7 +1810,11 @@ ; ; AVX512-LABEL: avg_v32i8_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1084,41 +1847,109 @@ ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1142,7 +1973,8 @@ ; ; AVX-LABEL: avg_v4i16_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -1163,12 +1995,33 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = zext <8 x i16> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1191,16 +2044,25 @@ ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1208,7 +2070,8 @@ ; ; AVX512-LABEL: avg_v16i16_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1241,23 +2104,43 @@ ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper @@ -1265,10 +2148,14 @@ ; ; AVX512F-LABEL: avg_v32i16_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1276,7 +2163,11 @@ ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1739,141 +2630,114 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movaps (%rsi), %xmm0 -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps (%rsi), %xmm1 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rbp, %rax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r13,%rbp), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r12,%rbp), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r15,%rbp), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r14,%rbp), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r11,%rbp), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r10,%rbp), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r9,%rbp), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r8,%rbp), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: addq $-1, %rcx -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: adcq $-1, %rdx -; SSE2-NEXT: addq $-1, %rax -; SSE2-NEXT: adcq $-1, %rbp -; SSE2-NEXT: shldq $63, %rax, %rbp -; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: movq %rbp, %xmm0 -; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm3 -; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm2 -; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm5 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm4 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm6 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm7 -; SSE2-NEXT: shrq %r10 -; SSE2-NEXT: movq %r10, %xmm9 -; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movq %r9, %xmm8 -; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm10 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm11 -; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movq %rsi, %xmm12 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm13 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm14 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rcx,%rsi), %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rdx,%rsi), %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rdi,%rsi), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r9,%rsi), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rbx,%rsi), %ebx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rbp,%rsi), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r12,%rsi), %r12d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rax,%rsi), %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r13,%rsi), %r13d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r15,%rsi), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r14,%rsi), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r11,%rsi), %r11d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r10,%rsi), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: leal -1(%r8,%r10), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: leal -1(%rcx,%r10), %r10d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: leal -1(%rdx,%rcx), %ecx +; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrl %r10d +; SSE2-NEXT: movd %r10d, %xmm1 +; SSE2-NEXT: shrl %r8d +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: shrl %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: shrl %r11d +; SSE2-NEXT: movd %r11d, %xmm4 +; SSE2-NEXT: shrl %r14d +; SSE2-NEXT: movd %r14d, %xmm5 +; SSE2-NEXT: shrl %r15d +; SSE2-NEXT: movd %r15d, %xmm6 +; SSE2-NEXT: shrl %r13d +; SSE2-NEXT: movd %r13d, %xmm7 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: shrl %r12d +; SSE2-NEXT: movd %r12d, %xmm9 +; SSE2-NEXT: shrl %ebp +; SSE2-NEXT: movd %ebp, %xmm10 +; SSE2-NEXT: shrl %ebx +; SSE2-NEXT: movd %ebx, %xmm11 +; SSE2-NEXT: shrl %r9d +; SSE2-NEXT: movd %r9d, %xmm12 +; SSE2-NEXT: shrl %edi +; SSE2-NEXT: movd %edi, %xmm13 +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psllq $48, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm13, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movupd %xmm2, (%rax) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; SSE2-NEXT: movdqu %xmm15, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1894,102 +2758,92 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %r10d -; AVX1-NEXT: vpextrw $7, %xmm0, %edx -; AVX1-NEXT: vpextrw $0, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm3, %r8d +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpextrw $0, %xmm0, %edx +; AVX1-NEXT: vpextrw $1, %xmm0, %esi +; AVX1-NEXT: vpextrw $2, %xmm0, %edi +; AVX1-NEXT: vpextrw $3, %xmm0, %r8d +; AVX1-NEXT: vpextrw $0, %xmm3, %r10d +; AVX1-NEXT: vpextrw $1, %xmm3, %r11d +; AVX1-NEXT: vpextrw $4, %xmm3, %r14d +; AVX1-NEXT: vpextrw $5, %xmm3, %r15d +; AVX1-NEXT: vpextrw $7, %xmm3, %r12d +; AVX1-NEXT: vpextrw $6, %xmm3, %r13d +; AVX1-NEXT: vpextrw $3, %xmm3, %eax ; AVX1-NEXT: vpextrw $2, %xmm3, %r9d -; AVX1-NEXT: vpextrw $3, %xmm3, %r11d -; AVX1-NEXT: vpextrw $4, %xmm3, %ebx -; AVX1-NEXT: vpextrw $5, %xmm3, %r14d -; AVX1-NEXT: vpextrw $6, %xmm3, %r15d -; AVX1-NEXT: vpextrw $7, %xmm3, %esi -; AVX1-NEXT: vpextrw $1, %xmm0, %r13d -; AVX1-NEXT: vpextrw $0, %xmm0, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %r13, %rcx -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi -; AVX1-NEXT: vpextrw $6, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%r15,%r12), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %r15d -; AVX1-NEXT: leaq -1(%r14,%r15), %r13 -; AVX1-NEXT: vpextrw $4, %xmm2, %r14d -; AVX1-NEXT: leaq -1(%rbx,%r14), %r12 -; AVX1-NEXT: vpextrw $3, %xmm2, %ebx -; AVX1-NEXT: leaq -1(%r11,%rbx), %r15 -; AVX1-NEXT: vpextrw $2, %xmm2, %r11d -; AVX1-NEXT: leaq -1(%r9,%r11), %r14 +; AVX1-NEXT: vpextrw $2, %xmm2, %ebx +; AVX1-NEXT: leal -1(%r9,%rbx), %r9d +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm2, %r9d +; AVX1-NEXT: leal -1(%rax,%r9), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm2, %eax +; AVX1-NEXT: leal -1(%r13,%rax), %ebp +; AVX1-NEXT: vpextrw $7, %xmm2, %eax +; AVX1-NEXT: leal -1(%r12,%rax), %eax +; AVX1-NEXT: vpextrw $5, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r15,%r9), %r13 +; AVX1-NEXT: vpextrw $4, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r14,%r9), %r12 ; AVX1-NEXT: vpextrw $1, %xmm2, %r9d -; AVX1-NEXT: leaq -1(%r8,%r9), %rbx -; AVX1-NEXT: vpextrw $0, %xmm2, %r8d -; AVX1-NEXT: leaq -1(%rdi,%r8), %r11 -; AVX1-NEXT: vpextrw $7, %xmm1, %edi -; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9 -; AVX1-NEXT: vpextrw $6, %xmm1, %edx -; AVX1-NEXT: leaq -1(%r10,%rdx), %r8 -; AVX1-NEXT: vpextrw $5, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi -; AVX1-NEXT: vpextrw $4, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: vpextrw $2, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: adcq $-1, %r10 -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdx -; AVX1-NEXT: shldq $63, %rax, %rdx -; AVX1-NEXT: shldq $63, %rcx, %r10 -; AVX1-NEXT: shrq %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: shrq %rbp -; AVX1-NEXT: vmovq %rbp, %xmm1 +; AVX1-NEXT: leaq -1(%r11,%r9), %r15 +; AVX1-NEXT: vpextrw $0, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r10,%r9), %r14 +; AVX1-NEXT: vpextrw $3, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%r8,%r9), %r11 +; AVX1-NEXT: vpextrw $2, %xmm1, %r8d +; AVX1-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX1-NEXT: vpextrw $1, %xmm1, %edi +; AVX1-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX1-NEXT: vpextrw $0, %xmm1, %esi +; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX1-NEXT: vpextrw $7, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX1-NEXT: vpextrw $4, %xmm0, %r9d +; AVX1-NEXT: vpextrw $4, %xmm1, %ebx +; AVX1-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: shrl %ebp +; AVX1-NEXT: vmovd %ebp, %xmm1 ; AVX1-NEXT: shrq %r13 ; AVX1-NEXT: vmovq %r13, %xmm2 ; AVX1-NEXT: shrq %r12 ; AVX1-NEXT: vmovq %r12, %xmm3 +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm5 ; AVX1-NEXT: shrq %r15 -; AVX1-NEXT: vmovq %r15, %xmm4 +; AVX1-NEXT: vmovq %r15, %xmm6 ; AVX1-NEXT: shrq %r14 -; AVX1-NEXT: vmovq %r14, %xmm5 -; AVX1-NEXT: shrq %rbx -; AVX1-NEXT: vmovq %rbx, %xmm6 +; AVX1-NEXT: vmovq %r14, %xmm7 ; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm7 -; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm8 +; AVX1-NEXT: vmovq %r11, %xmm8 +; AVX1-NEXT: shrq %r10 +; AVX1-NEXT: vmovq %r10, %xmm9 ; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm9 +; AVX1-NEXT: vmovq %r8, %xmm10 ; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: vmovq %rdi, %xmm10 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %r10, %xmm12 +; AVX1-NEXT: vmovq %rdi, %xmm11 +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: vmovq %rsi, %xmm12 +; AVX1-NEXT: shrq %rdx ; AVX1-NEXT: vmovq %rdx, %xmm13 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm14 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm15 +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm14 +; AVX1-NEXT: shrq %r9 +; AVX1-NEXT: vmovq %r9, %xmm15 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] @@ -2003,14 +2857,13 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpsllq $48, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: popq %rbx @@ -2029,187 +2882,140 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vmovq %xmm7, %r13 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %rbp -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: vpextrq $1, %xmm8, %r15 -; AVX2-NEXT: vpextrq $1, %xmm2, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %rbx -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: vpextrq $1, %xmm5, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vmovq %xmm3, %rdi -; AVX2-NEXT: vpextrq $1, %xmm0, %r10 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpextrq $1, %xmm9, %r11 -; AVX2-NEXT: addq %r15, %r11 -; AVX2-NEXT: vpextrq $1, %xmm8, %r9 -; AVX2-NEXT: addq %r14, %r9 -; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r9 -; AVX2-NEXT: addq %rbx, %r9 -; AVX2-NEXT: movq %r9, %rbx -; AVX2-NEXT: vpextrq $1, %xmm4, %r15 -; AVX2-NEXT: addq %rsi, %r15 -; AVX2-NEXT: vpextrq $1, %xmm5, %r12 -; AVX2-NEXT: addq %rdx, %r12 -; AVX2-NEXT: vpextrq $1, %xmm3, %r9 -; AVX2-NEXT: addq %rcx, %r9 -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: vmovq %xmm6, %rdx -; AVX2-NEXT: addq %rdi, %rdx +; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r10, %rcx -; AVX2-NEXT: vmovq %xmm9, %r10 -; AVX2-NEXT: leaq -1(%r8,%r10), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %rdi -; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdi -; AVX2-NEXT: leaq -1(%r13,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vmovq %xmm4, %rdx +; AVX2-NEXT: vpextrq $1, %xmm4, %rsi +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vmovq %xmm4, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vmovq %xmm3, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: vmovq %xmm2, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r11 -; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r8d -; AVX2-NEXT: adcq $-1, %r8 -; AVX2-NEXT: addq $-1, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edi -; AVX2-NEXT: adcq $-1, %rdi -; AVX2-NEXT: addq $-1, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %r15 -; AVX2-NEXT: movl $0, %r10d -; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movl $0, %r14d -; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r9 -; AVX2-NEXT: movl $0, %ebp -; AVX2-NEXT: adcq $-1, %rbp -; AVX2-NEXT: addq $-1, %rsi -; AVX2-NEXT: movl $0, %r13d -; AVX2-NEXT: adcq $-1, %r13 -; AVX2-NEXT: addq $-1, %rdx -; AVX2-NEXT: movl $0, %ebx -; AVX2-NEXT: adcq $-1, %rbx -; AVX2-NEXT: addq $-1, %rcx -; AVX2-NEXT: movl $0, %eax -; AVX2-NEXT: adcq $-1, %rax -; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdx, %rbx -; AVX2-NEXT: shldq $63, %rsi, %r13 -; AVX2-NEXT: shldq $63, %r9, %rbp -; AVX2-NEXT: shldq $63, %r12, %r14 -; AVX2-NEXT: shldq $63, %r15, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r8 -; AVX2-NEXT: vmovq %r8, %xmm0 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm4, %r8 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vmovq %xmm1, %r10 +; AVX2-NEXT: vpextrq $1, %xmm1, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r14 +; AVX2-NEXT: vpextrq $1, %xmm1, %r15 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm1, %rbx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r13 +; AVX2-NEXT: vpextrq $1, %xmm1, %r12 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-NEXT: vmovq %xmm7, %rax +; AVX2-NEXT: leal -1(%r9,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm7, %rax +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-NEXT: leal -1(%rbx,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovq %xmm7, %rax +; AVX2-NEXT: leal -1(%r13,%rax), %ebp +; AVX2-NEXT: vpextrq $1, %xmm7, %rax +; AVX2-NEXT: leal -1(%r12,%rax), %r12d +; AVX2-NEXT: vpextrq $1, %xmm6, %rax +; AVX2-NEXT: leaq -1(%r15,%rax), %rax +; AVX2-NEXT: vmovq %xmm6, %r9 +; AVX2-NEXT: leaq -1(%r14,%r9), %r13 +; AVX2-NEXT: vpextrq $1, %xmm5, %r9 +; AVX2-NEXT: leaq -1(%r11,%r9), %r15 +; AVX2-NEXT: vmovq %xmm5, %r9 +; AVX2-NEXT: leaq -1(%r10,%r9), %r14 +; AVX2-NEXT: vpextrq $1, %xmm4, %r9 +; AVX2-NEXT: leaq -1(%r8,%r9), %r11 +; AVX2-NEXT: vmovq %xmm4, %r8 +; AVX2-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX2-NEXT: vpextrq $1, %xmm3, %rdi +; AVX2-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX2-NEXT: vmovq %xmm3, %rsi +; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx +; AVX2-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: vmovq %xmm1, %rbx +; AVX2-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: shrq %r13 +; AVX2-NEXT: vmovq %r13, %xmm1 +; AVX2-NEXT: shrq %r15 +; AVX2-NEXT: vmovq %r15, %xmm2 +; AVX2-NEXT: shrq %r14 +; AVX2-NEXT: vmovq %r14, %xmm3 +; AVX2-NEXT: shrq %r11 ; AVX2-NEXT: vmovq %r11, %xmm4 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm5 -; AVX2-NEXT: vmovq %r10, %xmm6 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: shrq %r10 +; AVX2-NEXT: vmovq %r10, %xmm5 +; AVX2-NEXT: shrq %r8 +; AVX2-NEXT: vmovq %r8, %xmm6 +; AVX2-NEXT: shrq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm7 +; AVX2-NEXT: shrl %r12d +; AVX2-NEXT: vmovd %r12d, %xmm8 +; AVX2-NEXT: shrl %ebp +; AVX2-NEXT: vmovd %ebp, %xmm9 +; AVX2-NEXT: shrq %rsi +; AVX2-NEXT: vmovq %rsi, %xmm10 +; AVX2-NEXT: shrq %rdx +; AVX2-NEXT: vmovq %rdx, %xmm11 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm12 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm13 ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %r14, %xmm8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rbp, %xmm10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %rbx, %xmm13 -; AVX2-NEXT: vmovq %rax, %xmm14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: vmovq %rax, %xmm15 +; AVX2-NEXT: vmovq %rcx, %xmm14 +; AVX2-NEXT: shrq %r9 +; AVX2-NEXT: vmovq %r9, %xmm15 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -2228,160 +3034,140 @@ ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %rcx +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vmovq %xmm4, %rdx +; AVX512-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512-NEXT: vmovq %xmm4, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %r9 -; AVX512-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r11 -; AVX512-NEXT: vpextrq $1, %xmm3, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rax -; AVX512-NEXT: vpextrq $1, %xmm4, %r12 +; AVX512-NEXT: vpextrq $1, %xmm4, %r8 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vmovq %xmm1, %r10 +; AVX512-NEXT: vpextrq $1, %xmm1, %r11 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r14 ; AVX512-NEXT: vpextrq $1, %xmm1, %r15 -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm1, %rbx +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r13 +; AVX512-NEXT: vpextrq $1, %xmm1, %r12 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512-NEXT: vmovq %xmm7, %rax +; AVX512-NEXT: leal -1(%r9,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm7, %rax ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm8, %rsi -; AVX512-NEXT: addq %rax, %rsi -; AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: addq %r12, %rdx -; AVX512-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512-NEXT: addq %r15, %rcx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: addq %r14, %rax -; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r13 -; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r12 -; AVX512-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512-NEXT: leaq -1(%r10,%r11), %r15 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %r14 -; AVX512-NEXT: vmovq %xmm8, %r9 +; AVX512-NEXT: leal -1(%rbx,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovq %xmm7, %rax +; AVX512-NEXT: leal -1(%r13,%rax), %ebp +; AVX512-NEXT: vpextrq $1, %xmm7, %rax +; AVX512-NEXT: leal -1(%r12,%rax), %r12d +; AVX512-NEXT: vpextrq $1, %xmm6, %rax +; AVX512-NEXT: leaq -1(%r15,%rax), %rax +; AVX512-NEXT: vmovq %xmm6, %r9 +; AVX512-NEXT: leaq -1(%r14,%r9), %r13 +; AVX512-NEXT: vpextrq $1, %xmm5, %r9 +; AVX512-NEXT: leaq -1(%r11,%r9), %r15 +; AVX512-NEXT: vmovq %xmm5, %r9 +; AVX512-NEXT: leaq -1(%r10,%r9), %r14 +; AVX512-NEXT: vpextrq $1, %xmm4, %r9 ; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm7, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rbp,%rdi), %r9 -; AVX512-NEXT: vmovq %xmm6, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rdi ; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: vmovq %xmm3, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: addq $-1, %rsi -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %ebp -; AVX512-NEXT: adcq $-1, %rbp -; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx -; AVX512-NEXT: addq $-1, %rax -; AVX512-NEXT: adcq $-1, %r8 -; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %rbx -; AVX512-NEXT: shldq $63, %rdx, %rbp -; AVX512-NEXT: shldq $63, %rsi, %rdi +; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX512-NEXT: vpextrq $1, %xmm3, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX512-NEXT: vmovq %xmm3, %rsi +; AVX512-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX512-NEXT: vmovq %xmm2, %rcx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX512-NEXT: vmovq %xmm0, %r9 +; AVX512-NEXT: vmovq %xmm1, %rbx +; AVX512-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX512-NEXT: shrq %rax +; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: shrq %r13 -; AVX512-NEXT: vmovq %r13, %xmm0 -; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %r13, %xmm1 ; AVX512-NEXT: shrq %r15 ; AVX512-NEXT: vmovq %r15, %xmm2 ; AVX512-NEXT: shrq %r14 ; AVX512-NEXT: vmovq %r14, %xmm3 -; AVX512-NEXT: vmovq %rdi, %xmm4 ; AVX512-NEXT: shrq %r11 -; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rbp, %xmm6 +; AVX512-NEXT: vmovq %r11, %xmm4 ; AVX512-NEXT: shrq %r10 -; AVX512-NEXT: vmovq %r10, %xmm7 +; AVX512-NEXT: vmovq %r10, %xmm5 +; AVX512-NEXT: shrq %r8 +; AVX512-NEXT: vmovq %r8, %xmm6 +; AVX512-NEXT: shrq %rdi +; AVX512-NEXT: vmovq %rdi, %xmm7 +; AVX512-NEXT: shrl %r12d +; AVX512-NEXT: vmovd %r12d, %xmm8 +; AVX512-NEXT: shrl %ebp +; AVX512-NEXT: vmovd %ebp, %xmm9 +; AVX512-NEXT: shrq %rsi +; AVX512-NEXT: vmovq %rsi, %xmm10 +; AVX512-NEXT: shrq %rdx +; AVX512-NEXT: vmovq %rdx, %xmm11 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm12 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm13 +; AVX512-NEXT: shrq %rcx +; AVX512-NEXT: vmovq %rcx, %xmm14 ; AVX512-NEXT: shrq %r9 -; AVX512-NEXT: vmovq %r9, %xmm8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %rbx, %xmm12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm13 -; AVX512-NEXT: vmovq %r8, %xmm14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm15 +; AVX512-NEXT: vmovq %r9, %xmm15 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2 +; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vmovdqu %xmm0, (%rax) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll @@ -561,12 +561,12 @@ ; CHECK-NEXT: movl %ecx, 28(%rdi) ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_stack: @@ -579,8 +579,8 @@ ; DISABLED-NEXT: movups %xmm0, 16(%rdi) ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: retq ; ; AVX-LABEL: test_stack: diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -338,23 +338,25 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) { ; AVX1-LABEL: andn_disguised_i8_elts: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_disguised_i8_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; INT256-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %add = add <8 x i32> %y, %x @@ -417,17 +419,17 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) { ; AVX1-LABEL: andn_variable_mask_operand_concat: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_variable_mask_operand_concat: diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll --- a/llvm/test/CodeGen/X86/avx-shift.ll +++ b/llvm/test/CodeGen/X86/avx-shift.ll @@ -215,11 +215,12 @@ define <16 x i16> @sext_v16i16(<16 x i16> %a) { ; CHECK-LABEL: sext_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $8, %xmm0, %xmm1 -; CHECK-NEXT: vpsraw $8, %xmm1, %xmm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 -; CHECK-NEXT: vpsraw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxbw %xmm1, %xmm1 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %b = trunc <16 x i16> %a to <16 x i8> @@ -230,11 +231,12 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) { ; CHECK-LABEL: sext_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $16, %xmm0, %xmm1 -; CHECK-NEXT: vpsrad $16, %xmm1, %xmm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpslld $16, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %b = trunc <8 x i32> %a to <8 x i16> diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -33,8 +33,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -601,8 +601,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx-vextractf128.ll b/llvm/test/CodeGen/X86/avx-vextractf128.ll --- a/llvm/test/CodeGen/X86/avx-vextractf128.ll +++ b/llvm/test/CodeGen/X86/avx-vextractf128.ll @@ -116,8 +116,8 @@ ; CHECK-LABEL: t9: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %ymm0, (%rdi) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovups %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm0, 16(%rdi) ; CHECK-NEXT: retq store i64 0, ptr %p %q = getelementptr i64, ptr %p, i64 1 diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -8,14 +8,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: ## %bb.0: -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll --- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll @@ -82,14 +82,19 @@ define <8 x float> @test7(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test7: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = insertelement <8 x float> undef, float %a, i32 0 %t1 = fsub <8 x float> , %t0 @@ -102,14 +107,19 @@ define <8 x float> @test8(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test8: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test8: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = fsub float -0.0, %a %t1 = insertelement <8 x float> undef, float %t0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -395,11 +395,21 @@ } define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind { -; CHECK-LABEL: sext_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $8, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: sext_v16i16: +; X86: # %bb.0: +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovsxbw %xmm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: sext_v16i16: +; X64: # %bb.0: +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: retq %b = trunc <16 x i16> %a to <16 x i8> %c = sext <16 x i8> %b to <16 x i16> ret <16 x i16> %c diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll @@ -29,15 +29,15 @@ ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $220, %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: add_v64i8_broadcasts: diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1695,13 +1695,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB49_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB49_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1766,13 +1766,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB51_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1838,13 +1838,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v16f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB53_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB53_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1910,14 +1910,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] -; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB55_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB55_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1982,13 +1981,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB57_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB57_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2054,13 +2053,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB59_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB59_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,9 +15,9 @@ ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -3005,9 +3005,8 @@ ; KNL-LABEL: zext_4xi1_to_4x32: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_4xi1_to_4x32: @@ -3020,8 +3019,8 @@ ; AVX512DQNOBW-LABEL: zext_4xi1_to_4x32: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> @@ -3032,8 +3031,8 @@ ; KNL-LABEL: zext_2xi1_to_2xi64: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: retq ; ; SKX-LABEL: zext_2xi1_to_2xi64: @@ -3046,8 +3045,8 @@ ; AVX512DQNOBW-LABEL: zext_2xi1_to_2xi64: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -732,11 +732,12 @@ define void @load_v3i1_broadcast_2_v1i1_store(ptr %a0,ptr %a1) { ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: testb $4, (%rdi) -; AVX512-NEXT: movl $255, %ecx -; AVX512-NEXT: cmovel %eax, %ecx -; AVX512-NEXT: kmovd %ecx, %k0 +; AVX512-NEXT: movzbl (%rdi), %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: btl $2, %eax +; AVX512-NEXT: movl $255, %eax +; AVX512-NEXT: cmovael %ecx, %eax +; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kshiftrb $2, %k0, %k0 ; AVX512-NEXT: kshiftlb $7, %k0, %k0 ; AVX512-NEXT: kshiftrb $7, %k0, %k0 @@ -745,11 +746,12 @@ ; ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: xorl %eax, %eax -; AVX512NOTDQ-NEXT: testb $4, (%rdi) -; AVX512NOTDQ-NEXT: movl $255, %ecx -; AVX512NOTDQ-NEXT: cmovel %eax, %ecx -; AVX512NOTDQ-NEXT: kmovd %ecx, %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: xorl %ecx, %ecx +; AVX512NOTDQ-NEXT: btl $2, %eax +; AVX512NOTDQ-NEXT: movl $255, %eax +; AVX512NOTDQ-NEXT: cmovael %ecx, %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -6,7 +6,7 @@ ; KNL-LABEL: hadd_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -15,7 +15,7 @@ ; SKX-LABEL: hadd_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax @@ -33,7 +33,7 @@ ; KNL-LABEL: hsub_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -42,7 +42,7 @@ ; SKX-LABEL: hsub_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -973,20 +973,22 @@ ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v2i1: ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1090,10 +1092,11 @@ ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1101,10 +1104,11 @@ ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1132,9 +1132,9 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -1204,9 +1204,9 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -5975,31 +5975,74 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(ptr) nounwind readonly define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_cmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8] -; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_cmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_cmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6025,23 +6068,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X86-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6050,23 +6100,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X64-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6092,31 +6152,74 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_ucmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_ucmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_ucmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6142,23 +6245,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6167,23 +6277,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6797,9 +6917,10 @@ ; X86-LABEL: test_vptestmd: ; X86: ## %bb.0: ; X86-NEXT: vptestmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6827,9 +6948,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_d_512: ; X86: ## %bb.0: ; X86-NEXT: vptestnmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc1] -; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6882,8 +7004,10 @@ define i16 @test_kand(i16 %a0, i16 %a1) { ; X86-LABEL: test_kand: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x41,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: andl $8, %eax ## encoding: [0x83,0xe0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6904,17 +7028,21 @@ define i16 @test_kandn(i16 %a0, i16 %a1) { ; X86-LABEL: test_kandn: ; X86: ## %bb.0: -; X86-NEXT: movl $65527, %eax ## encoding: [0xb8,0xf7,0xff,0x00,0x00] -; X86-NEXT: ## imm = 0xFFF7 -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ## encoding: [0x0b,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: movw $8, %ax ## encoding: [0x66,0xb8,0x08,0x00] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_kandn: ; X64: ## %bb.0: ; X64-NEXT: movl %edi, %eax ## encoding: [0x89,0xf8] -; X64-NEXT: orl $-9, %eax ## encoding: [0x83,0xc8,0xf7] +; X64-NEXT: orl $65527, %eax ## encoding: [0x0d,0xf7,0xff,0x00,0x00] +; X64-NEXT: ## imm = 0xFFF7 ; X64-NEXT: andl %esi, %eax ## encoding: [0x21,0xf0] ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ## encoding: [0xc3] @@ -6946,8 +7074,10 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x0b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: orl $8, %eax ## encoding: [0x83,0xc8,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6970,8 +7100,10 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxnor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6992,8 +7124,10 @@ define i16 @test_kxor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -86,17 +86,13 @@ define void @mask16_mem(ptr %ptr) { ; CHECK-LABEL: mask16_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw (%rdi), %k0 -; CHECK-NEXT: knotw %k0, %k0 -; CHECK-NEXT: kmovw %k0, (%rdi) +; CHECK-NEXT: notw (%rdi) ; CHECK-NEXT: retq ; ; X86-LABEL: mask16_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw (%eax), %k0 -; X86-NEXT: knotw %k0, %k0 -; X86-NEXT: kmovw %k0, (%eax) +; X86-NEXT: notw (%eax) ; X86-NEXT: retl %x = load i16, ptr %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> @@ -107,36 +103,15 @@ } define void @mask8_mem(ptr %ptr) { -; KNL-LABEL: mask8_mem: -; KNL: ## %bb.0: -; KNL-NEXT: notb (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: mask8_mem: -; SKX: ## %bb.0: -; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq -; -; AVX512BW-LABEL: mask8_mem: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: notb (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: mask8_mem: -; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovb (%rdi), %k0 -; AVX512DQ-NEXT: knotb %k0, %k0 -; AVX512DQ-NEXT: kmovb %k0, (%rdi) -; AVX512DQ-NEXT: retq +; CHECK-LABEL: mask8_mem: +; CHECK: ## %bb.0: +; CHECK-NEXT: notb (%rdi) +; CHECK-NEXT: retq ; ; X86-LABEL: mask8_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovb (%eax), %k0 -; X86-NEXT: knotb %k0, %k0 -; X86-NEXT: kmovb %k0, (%eax) +; X86-NEXT: notb (%eax) ; X86-NEXT: retl %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> @@ -156,8 +131,11 @@ ; ; X86-LABEL: mand16: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: korw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> @@ -1352,8 +1330,8 @@ ; ; X86-LABEL: test17: ; X86: ## %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al ; X86-NEXT: kshiftrq $6, %k0, %k1 @@ -3882,8 +3860,11 @@ ; ; X86-LABEL: test_v16i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3902,8 +3883,11 @@ ; ; X86-LABEL: test_v16i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3922,8 +3906,11 @@ ; ; X86-LABEL: test_v16i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3942,8 +3929,11 @@ ; ; X86-LABEL: test_v8i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3962,8 +3952,11 @@ ; ; X86-LABEL: test_v8i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3982,8 +3975,11 @@ ; ; X86-LABEL: test_v8i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -4712,6 +4708,8 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) { ; KNL-LABEL: ktest_6: ; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 @@ -4731,22 +4729,17 @@ ; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; KNL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit +; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; KNL-NEXT: LBB77_1: ## %bar -; KNL-NEXT: pushq %rax -; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo -; KNL-NEXT: addq $8, %rsp +; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: ktest_6: @@ -4793,6 +4786,8 @@ ; ; AVX512DQ-LABEL: ktest_6: ; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 @@ -4812,22 +4807,17 @@ ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; AVX512DQ-NEXT: kortestw %k0, %k0 ; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; AVX512DQ-NEXT: LBB77_1: ## %bar -; AVX512DQ-NEXT: pushq %rax -; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo -; AVX512DQ-NEXT: addq $8, %rsp +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: retq ; ; X86-LABEL: ktest_6: diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1316,9 +1316,10 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1327,8 +1328,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1342,10 +1344,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,8,4,6,4,12] +; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1718,9 +1721,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd %xmm1, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vextractps $3, %xmm2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpextrd $2, %xmm1, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1729,11 +1738,17 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm3 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm4 +; CHECK-NEXT: vmovd %xmm3, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vextractps $3, %xmm4, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpextrd $2, %xmm3, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1745,11 +1760,17 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 +; CHECK-NEXT: vmovd %xmm2, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextractps $3, %xmm3, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpextrd $2, %xmm2, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -2695,40 +2716,24 @@ } define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { -; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1] -; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 -; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1] -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2 -; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm3 # xmm3 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -2737,22 +2742,13 @@ } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1] -; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -3167,11 +3163,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3182,10 +3179,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermt2ps %ymm2, %ymm3, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3482,26 +3480,16 @@ } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3510,26 +3498,16 @@ } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -4672,10 +4650,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm3 # xmm3 = mem[0],zero +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4687,10 +4666,11 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -629,7 +629,8 @@ ; ; SKX-LABEL: usat_trunc_wb_128_mem: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %x3 = icmp ult <8 x i16> %i, %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> @@ -654,7 +655,8 @@ define void @usat_trunc_qb_512_mem(<8 x i64> %i, ptr %res) { ; ALL-LABEL: usat_trunc_qb_512_mem: ; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, @@ -864,18 +866,11 @@ } define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, ptr %res) { -; KNL-LABEL: smax_usat_trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) -; SKX-NEXT: retq +; ALL-LABEL: smax_usat_trunc_wb_128_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) +; ALL-NEXT: retq %x1 = icmp sgt <8 x i16> %i, %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> %x3 = icmp slt <8 x i16> %x2, @@ -907,7 +902,8 @@ ; ALL: ## %bb.0: ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x1 = icmp sgt <8 x i64> %i, diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -311,10 +311,6 @@ ; We implement the scalar broadcast intrinsics with vector initializers. ; Verify that the IR generated will produce the broadcast at the end. define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { -; ALL-LABEL: test_mm512_broadcastsd_pd: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 -; ALL-NEXT: retq entry: %0 = extractelement <2 x double> %a, i32 0 %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1187,14 +1187,22 @@ } define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 { -; AVX512-LABEL: test45: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] -; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] -; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: retq ## encoding: [0xc3] +; KNL-LABEL: test45: +; KNL: ## %bb.0: +; KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; KNL-NEXT: vpsrlw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xd0,0x0f] +; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; KNL-NEXT: retq ## encoding: [0xc3] +; +; AVX512BW-LABEL: test45: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] +; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512BW-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: test45: ; SKX: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -49,8 +49,8 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x5c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpblendmb %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0xcb] ; X86-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd3] ; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] @@ -108,8 +108,8 @@ ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x7f,0x01] ; X86-NEXT: vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -187,9 +187,9 @@ ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpblendmb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x08] ; X86-NEXT: vmovdqu8 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x11] ; X86-NEXT: retl # encoding: [0xc3] @@ -455,12 +455,11 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -491,9 +490,9 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -533,12 +532,11 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -569,9 +567,9 @@ define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1941,45 +1939,66 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_cmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp # encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xd0] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2114,45 +2133,66 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_x86_avx512_ucmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp # encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x01] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x06] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2589,13 +2629,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2622,9 +2663,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2650,13 +2692,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestnmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2683,9 +2726,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll @@ -38,9 +38,7 @@ define void @mask32_mem(ptr %ptr) { ; CHECK-LABEL: mask32_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd (%rdi), %k0 -; CHECK-NEXT: knotd %k0, %k0 -; CHECK-NEXT: kmovd %k0, (%rdi) +; CHECK-NEXT: notl (%rdi) ; CHECK-NEXT: retq %x = load i32, ptr %ptr, align 4 %m0 = bitcast i32 %x to <32 x i1> @@ -56,9 +54,7 @@ define void @mask64_mem(ptr %ptr) { ; CHECK-LABEL: mask64_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovq (%rdi), %k0 -; CHECK-NEXT: knotq %k0, %k0 -; CHECK-NEXT: kmovq %k0, (%rdi) +; CHECK-NEXT: notq (%rdi) ; CHECK-NEXT: retq %x = load i64, ptr %ptr, align 4 %m0 = bitcast i64 %x to <64 x i1> diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -927,9 +927,9 @@ define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -961,9 +961,9 @@ define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -996,9 +996,9 @@ define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1030,9 +1030,9 @@ define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1065,9 +1065,9 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1131,9 +1131,9 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4846,7 +4846,7 @@ ; X64-LABEL: test_cmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] @@ -4946,7 +4946,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] @@ -5040,7 +5040,7 @@ ; X64-LABEL: test_ucmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] @@ -5140,7 +5140,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] @@ -5186,31 +5186,68 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_cmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8] -; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5236,23 +5273,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X86-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5261,23 +5304,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X64-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5303,31 +5351,68 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_ucmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5353,23 +5438,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5378,23 +5469,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5420,30 +5516,66 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_cmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8] -; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5469,23 +5601,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5493,23 +5631,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5534,30 +5677,66 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_ucmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5583,23 +5762,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5607,23 +5792,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -6119,9 +6309,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6146,9 +6337,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6199,9 +6391,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -6228,9 +6421,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6255,9 +6449,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6308,9 +6503,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll @@ -7,9 +7,9 @@ define zeroext i8 @test_mm512_mask_fpclass_pd_mask(i8 zeroext %__U, <8 x double> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_pd_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclasspd $4, %zmm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclasspd $4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -49,9 +49,9 @@ define zeroext i16 @test_mm512_mask_fpclass_ps_mask(i16 zeroext %__U, <16 x float> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $4, %zmm0, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps $4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -17,9 +17,7 @@ define void @mask8_mem(ptr %ptr) { ; CHECK-LABEL: mask8_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovb (%rdi), %k0 -; CHECK-NEXT: knotb %k0, %k0 -; CHECK-NEXT: kmovb %k0, (%rdi) +; CHECK-NEXT: notb (%rdi) ; CHECK-NEXT: retq %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll @@ -336,9 +336,9 @@ define zeroext i8 @test_mm256_mask_fpclass_ps_mask(i8 zeroext %__U, <8 x float> %__A) { ; X86-LABEL: test_mm256_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $2, %ymm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll --- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -92,9 +92,9 @@ ; ; X86-LABEL: TEST_mm512_mask_test_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -198,9 +198,9 @@ ; ; X86-LABEL: TEST_mm512_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll @@ -528,12 +528,12 @@ define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) { ; X86-LABEL: fmadd_sh_mask_memfold: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovsh (%ecx), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01] ; X86-NEXT: vmovsh (%eax), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08] ; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1] ; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1995,25 +1995,27 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X64-LABEL: test21: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq ; ; X86-LABEL: test21: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpbroadcastw %xmm1, %xmm1 -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpbroadcastw %xmm2, %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: vpbroadcastw %xmm2, %xmm1 +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: retl %1 = insertelement <8 x half> , half %a, i32 0 %2 = insertelement <8 x half> %1, half %b, i32 1 @@ -2099,7 +2101,9 @@ ; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X64-NEXT: retq @@ -2115,7 +2119,9 @@ ; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X86-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl %ebp, %esp @@ -2130,8 +2136,9 @@ define <8 x i16> @pr59628_xmm(i16 %arg) { ; X64-LABEL: pr59628_xmm: ; X64: # %bb.0: -; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vpbroadcastw %edi, %xmm1 +; X64-NEXT: vmovsh %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll @@ -283,8 +283,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll @@ -282,8 +282,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -19496,8 +19496,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19682,8 +19682,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19859,8 +19859,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -20033,8 +20033,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -21162,8 +21162,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl $3, %eax @@ -21343,8 +21343,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21529,8 +21529,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21706,8 +21706,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21880,8 +21880,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -22068,8 +22068,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22260,8 +22260,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22443,8 +22443,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22623,8 +22623,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/bit-test-shift.ll b/llvm/test/CodeGen/X86/bit-test-shift.ll --- a/llvm/test/CodeGen/X86/bit-test-shift.ll +++ b/llvm/test/CodeGen/X86/bit-test-shift.ll @@ -5,10 +5,12 @@ define i32 @x(i32 %t) nounwind readnone ssp { ; CHECK-LABEL: x: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shll $23, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: andl $-26, %eax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp) +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movl $-26, %eax +; CHECK-NEXT: .LBB0_2: # %entry ; CHECK-NEXT: retl entry: %and = and i32 %t, 256 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -163,17 +163,17 @@ ; ; AVX1-LABEL: v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -182,10 +182,12 @@ ; AVX2-LABEL: v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -357,20 +357,17 @@ ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -426,27 +423,46 @@ ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX12-LABEL: v16f32: -; AVX12: # %bb.0: -; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 -; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: # kill: def $ax killed $ax killed $eax -; AVX12-NEXT: vzeroupper -; AVX12-NEXT: retq +; AVX1-LABEL: v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 +; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: v16f32: ; AVX512F: # %bb.0: @@ -585,3 +601,5 @@ %res = bitcast <64 x i1> %y to i64 ret i64 %res } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX12: {{.*}} diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -566,18 +566,14 @@ ; ; AVX512F-LABEL: bitcast_16i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vpmovmskb %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_16i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %xmm0, %k0 -; AVX512BW-NEXT: kmovw %k0, (%rdi) +; AVX512BW-NEXT: vpmovmskb %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <16 x i8> %a0, zeroinitializer %a2 = bitcast <16 x i1> %a1 to i16 @@ -638,17 +634,13 @@ ; ; AVX512F-LABEL: bitcast_4i32_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskps %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_4i32_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskps %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <4 x i32> %a0, zeroinitializer @@ -672,17 +664,13 @@ ; ; AVX512F-LABEL: bitcast_2i64_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskpd %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_2i64_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskpd %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <2 x i64> %a0, zeroinitializer diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll @@ -330,26 +330,12 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: bitcast_32i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_32i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %ymm0, %k0 -; AVX512BW-NEXT: kmovd %k0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_32i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: movl %eax, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <32 x i8> %a0, zeroinitializer %a2 = bitcast <32 x i1> %a1 to i32 store i32 %a2, ptr %p @@ -447,23 +433,12 @@ ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq ; -; AVX512F-LABEL: bitcast_4i64_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_4i64_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_4i64_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <4 x i64> %a0, zeroinitializer %a2 = bitcast <4 x i1> %a1 to i4 store i4 %a2, ptr %p diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -450,24 +450,12 @@ ; ; AVX512F-LABEL: bitcast_64i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-NEXT: kmovw %k3, 6(%rdi) -; AVX512F-NEXT: kmovw %k2, 4(%rdi) -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpmovmskb %ymm1, %eax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: vpmovmskb %ymm0, %ecx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: movq %rcx, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -615,13 +603,10 @@ ; ; AVX1-LABEL: bitcast_8i64_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movb %al, (%rdi) diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -46,30 +46,27 @@ } define i1 @trunc_v2i64_cmp(<2 x i64> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v2i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $63, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v2i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v2i64_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX12-NEXT: vtestpd %xmm0, %xmm0 ; AVX12-NEXT: sete %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v2i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <2 x i64> %a0 to <2 x i1> @@ -79,15 +76,30 @@ } define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v4i32_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i32_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i32_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i32_to_v2i2: ; AVX: # %bb.0: @@ -107,31 +119,29 @@ } define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v4i32_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i32_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v4i32_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX12-NEXT: vtestps %xmm1, %xmm0 ; AVX12-NEXT: setb %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <4 x i32> %a0 to <4 x i1> %2 = bitcast <4 x i1> %1 to i4 @@ -140,16 +150,32 @@ } define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind { -; SSE-LABEL: bitcast_v8i16_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i16_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i16_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX12-LABEL: bitcast_v8i16_to_v2i4: ; AVX12: # %bb.0: @@ -181,23 +207,19 @@ } define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v8i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v8i16_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX12-NEXT: setne %al ; AVX12-NEXT: retq ; @@ -232,24 +254,14 @@ ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX12-LABEL: bitcast_v16i8_to_v2i8: -; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax -; AVX12-NEXT: shrl $8, %eax -; AVX12-NEXT: addb %cl, %al -; AVX12-NEXT: # kill: def $al killed $al killed $eax -; AVX12-NEXT: retq -; -; AVX512-LABEL: bitcast_v16i8_to_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq +; AVX-LABEL: bitcast_v16i8_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq %1 = icmp slt <16 x i8> %a0, zeroinitializer %2 = bitcast <16 x i1> %1 to <2 x i8> %3 = extractelement <2 x i8> %2, i32 0 @@ -259,32 +271,21 @@ } define i1 @trunc_v16i8_cmp(<16 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq -; -; AVX12-LABEL: trunc_v16i8_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: setae %al -; AVX12-NEXT: retq +; SSE-LABEL: trunc_v16i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; AVX512-LABEL: trunc_v16i8_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v16i8_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: setne %al +; AVX-NEXT: retq %1 = trunc <16 x i8> %a0 to <16 x i1> %2 = bitcast <16 x i1> %1 to i16 %3 = icmp ne i16 %2, -1 @@ -296,16 +297,32 @@ ; define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v4i64_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i64_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i64_to_v2i2: ; AVX: # %bb.0: @@ -326,41 +343,39 @@ } define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v4i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v4i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -371,17 +386,34 @@ } define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v8i32_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i32_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i32_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v8i32_to_v2i4: ; AVX: # %bb.0: @@ -402,33 +434,35 @@ } define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i132_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; ; SSE41-LABEL: trunc_v8i132_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i132_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setae %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i132_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setae %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -510,33 +544,38 @@ } define i1 @trunc_v16i16_cmp(<16 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v16i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -585,10 +624,9 @@ ; ; AVX512-LABEL: bitcast_v32i8_to_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %ymm0, %k0 -; AVX512-NEXT: kshiftrd $16, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %ecx +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: addl %ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -602,42 +640,41 @@ } define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i8_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = trunc <32 x i8> %a0 to <32 x i1> @@ -651,29 +688,45 @@ ; define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v8i64_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i64_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX1-LABEL: bitcast_v8i64_to_v2i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movl %eax, %ecx @@ -740,26 +793,43 @@ ; ; SSE41-LABEL: trunc_v8i64_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -858,37 +928,65 @@ define i1 @trunc_v16i32_cmp(<16 x i32> %a0) nounwind { ; SSE2-SSSE3-LABEL: trunc_v16i32_cmp: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: psllw $7, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; SSE2-SSSE3-NEXT: testl %eax, %eax ; SSE2-SSSE3-NEXT: sete %al ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -961,41 +1059,50 @@ } define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: notl %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: psllw $7, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1026,7 +1133,6 @@ ; SSE2-SSSE3-NEXT: shll $16, %edx ; SSE2-SSSE3-NEXT: orl %eax, %edx ; SSE2-SSSE3-NEXT: shlq $32, %rdx -; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax @@ -1473,14 +1579,10 @@ ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: movdqu (%rdi), %xmm1 ; SSE-NEXT: movdqu 16(%rdi), %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE-NEXT: packssdw %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pmovmskb %xmm3, %eax -; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -35,21 +35,21 @@ define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind { ; X86-LABEL: bitselect_i16: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorw %ax, %cx +; X86-NEXT: andw %cx, %ax +; X86-NEXT: notl %ecx ; X86-NEXT: andw {{[0-9]+}}(%esp), %cx -; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-NOBMI-LABEL: bitselect_i16: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %edx, %eax -; X64-NOBMI-NEXT: andl %edx, %esi -; X64-NOBMI-NEXT: notl %eax -; X64-NOBMI-NEXT: andl %edi, %eax -; X64-NOBMI-NEXT: orl %esi, %eax +; X64-NOBMI-NEXT: movl %esi, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: andl %edx, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -6,8 +6,8 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { ; CHECK-LABEL: sext_inc: ; CHECK: # %bb.0: -; CHECK-NEXT: xorb $1, %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl $1, %eax ; CHECK-NEXT: retq %ext = sext i1 %x to i32 %add = add i32 %ext, 1 @@ -19,8 +19,10 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, @@ -31,8 +33,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -44,7 +46,8 @@ ; CHECK-LABEL: cmpne_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %cmp = icmp ne <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -56,8 +59,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y %ext = sext <4 x i1> %cmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll --- a/llvm/test/CodeGen/X86/bool-math.ll +++ b/llvm/test/CodeGen/X86/bool-math.ll @@ -12,8 +12,9 @@ ; ; X32-LABEL: sub_zext_cmp_mask_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb $1, %al +; X32-NEXT: movzbl %al, %eax ; X32-NEXT: orl $-28, %eax ; X32-NEXT: retl %a = and i32 %x, 1 @@ -141,7 +142,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $42, %eax ; X32-NEXT: retl @@ -161,7 +162,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_wider_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $26, %eax ; X32-NEXT: xorl %edx, %edx @@ -183,7 +184,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $36, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -126,14 +126,21 @@ ; CHECK-LABEL: test2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: sarl $16, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: shll $8, %edi +; CHECK64-NEXT: andl $16711680, %edi # imm = 0xFF0000 +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: sarl $16, %eax ; CHECK64-NEXT: retq %and = lshr i32 %a, 8 diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll --- a/llvm/test/CodeGen/X86/bswap_tree2.ll +++ b/llvm/test/CodeGen/X86/bswap_tree2.ll @@ -11,20 +11,28 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andl $16711935, %ecx # imm = 0xFF00FF +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 ; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: andl $16711935, %eax # imm = 0xFF00FF -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl $-16777216, %edi # imm = 0xFF000000 -; CHECK64-NEXT: shrl $8, %edi +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi ; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll --- a/llvm/test/CodeGen/X86/bt.ll +++ b/llvm/test/CodeGen/X86/bt.ll @@ -1064,7 +1064,7 @@ ; X86-LABEL: extend: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax ; X86-NEXT: setb %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll --- a/llvm/test/CodeGen/X86/btc_bts_btr.ll +++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll @@ -859,8 +859,8 @@ ; X86-NEXT: .LBB33_2: ; X86-NEXT: notl %esi ; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: andl %esi, 4(%eax) +; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -899,8 +899,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: orl %esi, 4(%eax) +; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -938,8 +938,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: xorl %esi, 4(%eax) +; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -1027,8 +1027,8 @@ ; ; X86-LABEL: btr_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1062,8 +1062,8 @@ ; ; X86-LABEL: bts_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1094,8 +1094,8 @@ ; ; X86-LABEL: btc_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -8,22 +8,17 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 -; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: foo: ; SSE41: # %bb.0: ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: movl $255, %eax ; SSE41-NEXT: pinsrb $3, %eax, %xmm0 ; SSE41-NEXT: movd %xmm0, (%rdi) @@ -32,7 +27,8 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -50,12 +46,12 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_negative_zero_1: @@ -80,19 +76,14 @@ ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'. define <2 x double> @test_negative_zero_2(<2 x double> %A) { -; SSE2-LABEL: test_negative_zero_2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_negative_zero_2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: test_negative_zero_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: test_negative_zero_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 @@ -788,9 +779,10 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; SSE2-LABEL: PR46586: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl 3(%rdi), %eax -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pinsrw $6, %eax, %xmm1 +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movzbl 3(%rdi), %ecx +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -802,9 +794,10 @@ ; ; SSE41-LABEL: PR46586: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %eax ; SSE41-NEXT: xorl %edx, %edx ; SSE41-NEXT: divl %ecx ; SSE41-NEXT: movl %edx, %eax @@ -812,9 +805,10 @@ ; ; AVX-LABEL: PR46586: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %eax ; AVX-NEXT: vextractps $3, %xmm0, %ecx -; AVX-NEXT: vpextrb $3, %xmm1, %eax ; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: divl %ecx ; AVX-NEXT: movl %edx, %eax diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll @@ -174,13 +174,10 @@ ; CHECK-NEXT: imull %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $3, %edx -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shll $5, %edx +; CHECK-NEXT: shrl $3, %edx ; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: shll $5, %edx ; CHECK-NEXT: subl %edx, %ecx -; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl %resultdiv = sdiv i32 %a, 33 diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -343,9 +343,16 @@ ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovups da+4096(%rax), %ymm1 ; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovups dc+4096(%rax), %ymm2 ; AVX1-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, dj+4096(%rax) ; AVX1-NEXT: addq $32, %rax @@ -357,16 +364,22 @@ ; AVX2-LABEL: example25: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB5_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovups da+4096(%rax), %ymm0 -; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm0, %ymm0 -; AVX2-NEXT: vmovups dc+4096(%rax), %ymm1 -; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm1, %ymm1 -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, dj+4096(%rax) +; AVX2-NEXT: vmovups da+4096(%rax), %ymm1 +; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2 +; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, dj+4096(%rax) ; AVX2-NEXT: addq $32, %rax ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %for.end diff --git a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll --- a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll @@ -14,12 +14,12 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movups (%rdx), %xmm0 ; X64-NEXT: movups 16(%rdx), %xmm1 -; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; X64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: callq *__guard_dispatch_icall_fptr(%rip) ; X64-NEXT: nop ; X64-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -831,7 +831,6 @@ ; X86-NOCMOV-LABEL: cttz_i64_zero_test: ; X86-NOCMOV: # %bb.0: ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOCMOV-NOT: rep ; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx ; X86-NOCMOV-NEXT: movl $32, %eax ; X86-NOCMOV-NEXT: je .LBB15_2 @@ -852,12 +851,10 @@ ; X86-CMOV-LABEL: cttz_i64_zero_test: ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: movl $32, %edx ; X86-CMOV-NEXT: cmovnel %ecx, %edx ; X86-CMOV-NEXT: addl $32, %edx -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl %eax, %eax ; X86-CMOV-NEXT: cmovel %edx, %eax ; X86-CMOV-NEXT: xorl %edx, %edx @@ -1395,15 +1392,13 @@ ; ; X86-CLZ-LABEL: PR47603_trunc: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: xorb $31, %al +; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: PR47603_trunc: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntl %edi, %eax -; X64-CLZ-NEXT: xorb $31, %al +; X64-CLZ-NEXT: bsrl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq ; @@ -1481,13 +1476,11 @@ define i32 @cttz_i32_osize(i32 %x) optsize { ; X86-LABEL: cttz_i32_osize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_osize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; @@ -1517,13 +1510,11 @@ define i32 @cttz_i32_msize(i32 %x) minsize { ; X86-LABEL: cttz_i32_msize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_msize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cmov-promotion.ll b/llvm/test/CodeGen/X86/cmov-promotion.ll --- a/llvm/test/CodeGen/X86/cmov-promotion.ll +++ b/llvm/test/CodeGen/X86/cmov-promotion.ll @@ -30,20 +30,19 @@ define i32 @cmov_zpromotion_8_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB1_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB1_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 %ret = zext i8 %t0 to i32 @@ -53,20 +52,19 @@ define i64 @cmov_zpromotion_8_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovneq %rcx, %rax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB2_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB2_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 @@ -77,20 +75,19 @@ define i32 @cmov_zpromotion_16_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB3_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB3_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 %ret = zext i16 %t0 to i32 @@ -100,20 +97,19 @@ define i64 @cmov_zpromotion_16_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovneq %rcx, %rax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB4_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB4_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -216,7 +216,7 @@ ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: notl %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: cmovgl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll --- a/llvm/test/CodeGen/X86/cmp-bool.ll +++ b/llvm/test/CodeGen/X86/cmp-bool.ll @@ -25,8 +25,9 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; CHECK-LABEL: bool_ne: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb %sil, %dil -; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: jmpq *%rdx # TAILCALL ; CHECK-NEXT: .LBB1_1: # %if.end diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -36,7 +36,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 @@ -54,7 +54,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -310,8 +310,10 @@ define i8 @signbit_i16(i16 signext %L) { ; CHECK-LABEL: signbit_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff] -; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0] +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: shrl $15, %eax # encoding: [0xc1,0xe8,0x0f] +; CHECK-NEXT: xorb $1, %al # encoding: [0x34,0x01] +; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] %lshr = lshr i16 %L, 15 %trunc = trunc i16 %lshr to i8 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -589,8 +589,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -618,35 +619,38 @@ ; AVX1-LABEL: neg_scalar_broadcast_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: notq %rdi +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0] +; AVX2-NEXT: notq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] ; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -668,23 +672,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -708,29 +715,32 @@ ; AVX1-LABEL: neg_scalar_broadcast_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -743,30 +753,33 @@ define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v2i64: ; SSE: # %bb.0: +; SSE-NEXT: notq %rdi ; SSE-NEXT: movq %rdi, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <2 x i64> undef, i64 %1, i64 0 @@ -819,23 +832,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 @@ -847,32 +863,35 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v8i16: ; SSE: # %bb.0: +; SSE-NEXT: notl %edi ; SSE-NEXT: movd %edi, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v8i16: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i16: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i16: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastw %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i16 %a0, -1 %2 = insertelement <8 x i16> undef, i16 %1, i64 0 @@ -884,32 +903,36 @@ define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -954,8 +977,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1000,8 +1024,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1025,24 +1050,27 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <32 x i8> undef, i8 %1, i64 0 @@ -1055,32 +1083,36 @@ define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -1102,23 +1134,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 diff --git a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -113,6 +113,7 @@ define <4 x i64> @demandedelts_vpsrlvq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: demandedelts_vpsrlvq: ; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq %xmm1, %xmm1 ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -75,9 +75,9 @@ ; X86-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -115,9 +115,9 @@ ; X64-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -163,7 +163,7 @@ ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655764, %eax # imm = 0x55555554 ; X86-NEXT: leal (%eax,%ecx,2), %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl @@ -183,34 +183,33 @@ ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi ; X64-NEXT: andq %rax, %rdi -; X64-NEXT: leaq (%rdi,%rcx,4), %rax -; X64-NEXT: movabsq $6148914689804861440, %rcx # imm = 0x5555555500000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914685509894144, %rdx # imm = 0x5555555400000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: leaq (%rdx,%rcx,2), %rax -; X64-NEXT: shrq $33, %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: movabsq $1085102592318504960, %rcx # imm = 0xF0F0F0F00000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $4, %rax -; X64-NEXT: movabsq $1085102557958766592, %rdx # imm = 0xF0F0F0700000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: shlq $4, %rcx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: movabsq $3689348813882916864, %rax # imm = 0x3333333300000000 -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rcx -; X64-NEXT: movabsq $3689348805292982272, %rdx # imm = 0x3333333100000000 +; X64-NEXT: leaq (%rdi,%rcx,4), %rdx +; X64-NEXT: movabsq $6148914689804861440, %rax # imm = 0x5555555500000000 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: shrq %rdx +; X64-NEXT: movabsq $6148914685509894144, %rcx # imm = 0x5555555400000000 ; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: leaq (%rdx,%rax,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rsi,2), %rdx +; X64-NEXT: shrq $33, %rdx +; X64-NEXT: bswapq %rdx +; X64-NEXT: movabsq $1085102592318504960, %rsi # imm = 0xF0F0F0F00000000 +; X64-NEXT: andq %rdx, %rsi +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: movabsq $1085102557958766592, %rdi # imm = 0xF0F0F0700000000 +; X64-NEXT: andq %rdx, %rdi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: movabsq $3689348813882916864, %rdx # imm = 0x3333333300000000 +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movabsq $3689348805292982272, %rdi # imm = 0x3333333100000000 +; X64-NEXT: andq %rsi, %rdi +; X64-NEXT: leaq (%rdi,%rdx,4), %rdx +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: shrq %rdx ; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: leaq (%rdx,%rax,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = lshr i64 %1, 33 @@ -254,9 +253,9 @@ ; X86-NEXT: andl $36909875, %eax # imm = 0x2333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -294,9 +293,9 @@ ; X64-NEXT: andl $36909875, %ecx # imm = 0x2333333 ; X64-NEXT: leal (%rcx,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -338,7 +337,7 @@ ; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax @@ -377,12 +376,11 @@ ; X64-NEXT: shrq $2, %rax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: leaq (%rax,%rcx,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: leaq (%rax,%rcx,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = shl i64 %1, 33 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -541,8 +541,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -551,14 +554,20 @@ ; AVX512F-NEXT: vmovq %rdi, %xmm2 ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2 -; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer @@ -590,25 +599,43 @@ ; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 @@ -914,19 +941,35 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm4 ; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrr: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %rdi, %zmm2 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer %3 = xor <8 x i64> %1, @@ -966,21 +1009,49 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX2-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll --- a/llvm/test/CodeGen/X86/combine-bswap.ll +++ b/llvm/test/CodeGen/X86/combine-bswap.ll @@ -42,15 +42,15 @@ define i16 @test_bswap_srli_8_bswap_i16(i16 %a) nounwind { ; X86-LABEL: test_bswap_srli_8_bswap_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $8, %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_srli_8_bswap_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $8, %eax +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -106,7 +106,8 @@ ; X64-LABEL: test_bswap_shli_8_bswap_i16: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movzbl %ah, %eax +; X64-NEXT: andl $65280, %eax # imm = 0xFF00 +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -136,8 +137,12 @@ define i64 @test_bswap_shli_16_bswap_i64(i64 %a) nounwind { ; X86-LABEL: test_bswap_shli_16_bswap_i64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shrl $16, %edx ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_shli_16_bswap_i64: @@ -220,7 +225,7 @@ define i64 @test_bswap64_shift48(i64 %a0) { ; X86-LABEL: test_bswap64_shift48: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -48,8 +48,7 @@ ; AVX1-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000 ; AVX1-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001 ; AVX1-NEXT: movq %rcx, 46348(%rax) -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3] -; AVX1-NEXT: # ymm0 = mem[0,1,0,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [?,?,?,?] ; AVX1-NEXT: vmovups %ymm0, 48296(%rax) ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd %xmm0, 47372(%rax) @@ -91,25 +90,24 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0,2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,0],xmm0[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: concat_of_broadcast_v4f32_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [6,7,4,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = <6,0,u,3> +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %ld0 = load volatile <8 x float>, ptr %a0 diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll --- a/llvm/test/CodeGen/X86/combine-multiplies.ll +++ b/llvm/test/CodeGen/X86/combine-multiplies.ll @@ -105,21 +105,21 @@ define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11] -; CHECK-NEXT: paddd %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,22,22,22] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242] -; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [242,242,242,242] +; CHECK-NEXT: paddd %xmm2, %xmm1 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; CHECK-NEXT: movdqa %xmm1, v2 +; CHECK-NEXT: movdqa %xmm2, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, @@ -139,20 +139,20 @@ define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_non_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44] -; CHECK-NEXT: paddd %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,33,44,55] +; CHECK-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] -; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] +; CHECK-NEXT: paddd %xmm1, %xmm2 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa %xmm1, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -345,7 +345,9 @@ define <4 x float> @test25(<4 x float> %a0) { ; CHECK-LABEL: test25: ; CHECK: # %bb.0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <4 x float> %a0 to <4 x i32> %bc2 = bitcast <4 x float> to <4 x i32> diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -254,18 +254,18 @@ ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %edi ; AVX2-NEXT: vpextrd $1, %xmm0, %esi @@ -277,18 +277,18 @@ ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %edi ; AVX512VL-NEXT: vpextrd $1, %xmm0, %esi @@ -300,18 +300,18 @@ ; AVX512DQVL: # %bb.0: # %entry ; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512DQVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %edi ; AVX512DQVL-NEXT: vpextrd $1, %xmm0, %esi @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512VL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 @@ -616,7 +616,7 @@ ; AVX512DQVL-NEXT: .p2align 4, 0x90 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQVL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -163,8 +163,10 @@ ; ; AVX512-LABEL: combine_vec_rot_select_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512-NEXT: retq %3 = and <4 x i32> %1, %4 = shl <4 x i32> %0, %3 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1031,19 +1031,19 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1115,37 +1115,37 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1258,73 +1258,73 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm4 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrld $28, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movaps %xmm4, %xmm2 ; SSE2-NEXT: movaps %xmm5, %xmm3 @@ -1988,25 +1988,25 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -3055,7 +3055,8 @@ ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15] +; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2 +; XOP-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -3159,7 +3160,7 @@ ; CHECK-NEXT: testw %di, %di ; CHECK-NEXT: cmovnsl %edi, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: sarl $8, %eax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: negl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -711,13 +711,10 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; SSE2-LABEL: combine_vec_shl_mul0: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [20,20,20,20] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $2, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_mul0: diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -225,7 +225,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -247,23 +248,33 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) { ; SSE-LABEL: combine_vec_ashr_trunc_lshr_splat: ; SSE: # %bb.0: -; SSE-NEXT: psrad $26, %xmm3 -; SSE-NEXT: psrad $26, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psrad $26, %xmm1 -; SSE-NEXT: psrad $26, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: psrld $24, %xmm1 +; SSE-NEXT: psrld $24, %xmm0 +; SSE-NEXT: packusdw %xmm1, %xmm0 +; SSE-NEXT: psrld $24, %xmm3 +; SSE-NEXT: psrld $24, %xmm2 +; SSE-NEXT: packusdw %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_ashr_trunc_lshr_splat: ; AVX: # %bb.0: -; AVX-NEXT: vpsrad $26, %ymm1, %ymm1 -; AVX-NEXT: vpsrad $26, %ymm0, %ymm0 -; AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = lshr <16 x i32> %x, @@ -297,7 +308,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -319,16 +331,18 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) { ; SSE-LABEL: combine_vec_ashr_trunc_ashr_splat: ; SSE: # %bb.0: -; SSE-NEXT: psrad $19, %xmm1 -; SSE-NEXT: psrad $19, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packusdw %xmm1, %xmm0 +; SSE-NEXT: psraw $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_ashr_trunc_ashr_splat: ; AVX: # %bb.0: -; AVX-NEXT: vpsrad $19, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = ashr <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -494,7 +494,7 @@ ; CHECK-NEXT: leal 15(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-16, %ecx +; CHECK-NEXT: andl $65520, %ecx # imm = 0xFFF0 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq @@ -509,7 +509,7 @@ ; CHECK-NEXT: leal 255(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-256, %ecx +; CHECK-NEXT: andl $65280, %ecx # imm = 0xFF00 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll @@ -139,18 +139,18 @@ ; SSE-LABEL: demandedelts_pblendvb: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pshufb %xmm0, %xmm3 +; SSE-NEXT: pshufb %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: demandedelts_pblendvb: ; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -212,17 +212,17 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -286,10 +286,10 @@ ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: movdqu 16(%rdi), %xmm1 ; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: psubd %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: movdqu %xmm1, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032_oneuse_constant: @@ -317,14 +317,14 @@ ; SSE-NEXT: movdqu 16(%rdi), %xmm2 ; SSE-NEXT: movdqu 32(%rdi), %xmm3 ; SSE-NEXT: movdqu 48(%rdi), %xmm4 -; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm1 -; SSE-NEXT: movdqu %xmm1, (%rdi) +; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqu %xmm2, 16(%rdi) -; SSE-NEXT: psubd %xmm0, %xmm4 +; SSE-NEXT: movdqu %xmm1, (%rdi) ; SSE-NEXT: psubd %xmm0, %xmm3 -; SSE-NEXT: movdqu %xmm3, 32(%rdi) +; SSE-NEXT: psubd %xmm0, %xmm4 ; SSE-NEXT: movdqu %xmm4, 48(%rdi) +; SSE-NEXT: movdqu %xmm3, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032: diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll --- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll +++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll @@ -54,11 +54,11 @@ define void @baz(ptr %arg, ptr %arg1) optsize { ; CHECK-LABEL: baz: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3] -; CHECK-NEXT: andps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; CHECK-NEXT: movups %xmm1, (%rsi) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq 8(%rdi), %rcx +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: andl $3, %ecx +; CHECK-NEXT: movq %rcx, 8(%rsi) ; CHECK-NEXT: retq bb: %tmp = load <2 x i64>, ptr %arg, align 16 diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -476,8 +476,8 @@ ; WIN64-NEXT: # %bb.5: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] -; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00] +; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.6: # %sw.bb diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1595,10 +1595,10 @@ define i64 @test_i64_140737488289792_mask_lshr_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1615,10 +1615,11 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1769,10 +1770,10 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_ashr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1789,10 +1790,11 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_ashr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1996,12 +1998,13 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_15: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $16, %ecx ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $15, %eax, %edx -; X86-NEXT: andl $65536, %eax # imm = 0x10000 -; X86-NEXT: shll $15, %eax +; X86-NEXT: shldl $15, %ecx, %edx +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_i64_140737488289792_mask_shl_15: @@ -2017,7 +2020,8 @@ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl $16, %eax, %edx diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -17,7 +17,8 @@ ; X64-NEXT: imull %ecx, %esi ; X64-NEXT: addl %edx, %esi ; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq entry: %tmp7 = mul i32 %idxY, %ref_frame_stride ; [#uses=2] @@ -70,13 +71,13 @@ ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -7,7 +7,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -20,7 +22,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -33,7 +37,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -61,9 +67,11 @@ define i32 @select_or1(i32 %x, i32 %y) { ; CHECK-LABEL: select_or1: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -74,9 +82,11 @@ define i32 @select_or2(i32 %x, i32 %y) { ; CHECK-LABEL: select_or2: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -87,9 +97,11 @@ define i32 @select_or3(i32 %x, i32 %y) { ; CHECK-LABEL: select_or3: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -180,10 +192,9 @@ define i32 @sel_constants_shl_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_shl_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orl $2, %eax +; CHECK-NEXT: xorl $3, %eax ; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -589,8 +589,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: addl $156, %esp @@ -1033,35 +1033,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: psubd %xmm3, %xmm0 @@ -1089,25 +1089,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -540,8 +540,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl $132, %esp @@ -984,35 +984,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: psubd %xmm3, %xmm0 @@ -1040,25 +1040,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = udiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -320,7 +320,10 @@ ; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 ; X64-FAST-NEXT: movq %rdi, %rax ; X64-FAST-NEXT: mulq %rcx -; X64-FAST-NEXT: movq %rdx, %rax +; X64-FAST-NEXT: subq %rdx, %rdi +; X64-FAST-NEXT: shrq %rdi +; X64-FAST-NEXT: leaq (%rdi,%rdx), %rax +; X64-FAST-NEXT: shrq $2, %rax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: PR23590: @@ -329,10 +332,14 @@ ; X64-SLOW-NEXT: movq %rdi, %rax ; X64-SLOW-NEXT: mulq %rcx ; X64-SLOW-NEXT: shrq $12, %rdx -; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 -; X64-SLOW-NEXT: subq %rax, %rdi +; X64-SLOW-NEXT: imull $12345, %edx, %eax # imm = 0x3039 +; X64-SLOW-NEXT: subl %eax, %edi ; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925 ; X64-SLOW-NEXT: shrq $32, %rax +; X64-SLOW-NEXT: subl %eax, %edi +; X64-SLOW-NEXT: shrl %edi +; X64-SLOW-NEXT: addl %edi, %eax +; X64-SLOW-NEXT: shrl $2, %eax ; X64-SLOW-NEXT: retq entry: %rem = urem i64 %x, 12345 diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -19,8 +19,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __modti3 @@ -49,8 +49,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __divti3 @@ -79,8 +79,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __umodti3 @@ -969,8 +969,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: andq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax @@ -1001,8 +1001,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll --- a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll +++ b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll @@ -10,10 +10,13 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: .cfi_def_cfa_register %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: subl $24, %esp ; CHECK-NEXT: movl $1074339512, {{[0-9]+}}(%esp) # imm = 0x40091EB8 -; CHECK-NEXT: movl $1374389535, (%esp) # imm = 0x51EB851F -; CHECK-NEXT: movl $1078523331, {{[0-9]+}}(%esp) # imm = 0x4048F5C3 +; CHECK-NEXT: movl $1374389535, {{[0-9]+}}(%esp) # imm = 0x51EB851F +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -6,13 +6,18 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: no_dpbusd: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -22,13 +27,16 @@ ; ; AVX512-LABEL: no_dpbusd: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -49,44 +57,41 @@ define i32 @vpdpbusd_mutate(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_mutate: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rdi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovsxbd 8(%rdi), %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rdi), %ymm1 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_mutate: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_mutate: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rdi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_mutate: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd (%rdi), %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = sext <16 x i8> %0 to <16 x i32> @@ -109,9 +114,9 @@ ; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -128,9 +133,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -163,9 +168,9 @@ ; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -182,9 +187,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -210,44 +215,41 @@ define i32 @vpdpbusd_512(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_512: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_512: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_512: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_512: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -264,40 +266,35 @@ define i32 @vpdpbusd_256(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_256: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_256: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_256: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_256: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <8 x i8>, ptr %a, align 8 %1 = zext <8 x i8> %0 to <8 x i32> @@ -314,42 +311,29 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_128: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_128: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_128: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_128: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <4 x i8>, ptr %a, align 8 %1 = zext <4 x i8> %0 to <4 x i32> @@ -367,40 +351,28 @@ ; AVXVNNI-LABEL: vpdpbusd_2xi32: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_2xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_2xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_2xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <2 x i8>, ptr %a, align 8 %1 = zext <2 x i8> %0 to <2 x i32> @@ -417,13 +389,25 @@ define i32 @vpdpbusd_32xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_32xi32: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm4, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm4, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -431,38 +415,27 @@ ; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_32xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VNNI-NEXT: vmovdqu (%rsi), %ymm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_32xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %ymm0, %ymm1 -; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: vzeroupper -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_32xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <32 x i8>, ptr %a, align 16 %1 = zext <32 x i8> %0 to <32 x i32> @@ -479,17 +452,41 @@ define i32 @vpdpbusd_64xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_64xi32: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm2 -; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 40(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 56(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 32(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 48(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm4, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm4, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -499,15 +496,27 @@ ; ; AVX512-LABEL: vpdpbusd_64xi32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpdpbusd (%rsi), %zmm0, %zmm1 -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm4 +; AVX512-NEXT: vpmulld %zmm0, %zmm4, %zmm0 +; AVX512-NEXT: vpmovsxbd 48(%rsi), %zmm4 +; AVX512-NEXT: vpmulld %zmm1, %zmm4, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxbd 32(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -526,3 +535,6 @@ } declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512VLVNNI: {{.*}} +; AVX512VNNI: {{.*}} diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -24,35 +24,17 @@ } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_zc: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -64,35 +46,39 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi4_cz: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax +; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax ; AVXVNNI-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax +; AVX512VNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vmovd %xmm0, %eax ; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper ; AVX512VNNI-NEXT: retq ; ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax +; AVX512VLVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax ; AVX512VLVNNI-NEXT: retq entry: @@ -104,38 +90,17 @@ } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_cs: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_cs: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_cs: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -167,41 +132,41 @@ define i32 @mul_16xi8_zc(<16 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_16xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm2 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: mul_16xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_16xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: mul_16xi8_zc: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edi, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = zext <16 x i8> %a to <16 x i32> %1 = mul nsw <16 x i32> %0, @@ -213,12 +178,26 @@ define i32 @mul_32xi8_zc(<32 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_32xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm4 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -232,9 +211,9 @@ ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512VNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VNNI-NEXT: vmovd %xmm0, %eax @@ -247,9 +226,9 @@ ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax @@ -267,16 +246,41 @@ define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_64xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm8 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm8, %ymm5 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm8, %ymm7 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm8, %ymm6 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm6, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm7, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm5, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -291,9 +295,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -6,15 +6,20 @@ define i32 @mul_i8i8(ptr%a, <16 x i8> %b, i32 %c) { ; CHECK-LABEL: mul_i8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %esi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 @@ -30,14 +35,20 @@ ; CHECK-LABEL: mul_i4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -51,20 +62,23 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 -; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -78,17 +92,20 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_sext_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; CHECK-NEXT: vpsllw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsraw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $28, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax @@ -108,16 +125,22 @@ ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll --- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll +++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll @@ -625,10 +625,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: retq @@ -1038,10 +1038,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1563,10 +1563,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1590,22 +1590,22 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { ; ALL-LABEL: vec512_i256: ; ALL: # %bb.0: -; ALL-NEXT: movq 16(%rdi), %rax -; ALL-NEXT: movq 24(%rdi), %rcx +; ALL-NEXT: movq 24(%rdi), %rax +; ALL-NEXT: movq 16(%rdi), %rcx ; ALL-NEXT: movq (%rdi), %rdx ; ALL-NEXT: movq 8(%rdi), %rdi -; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rdx +; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rcx, 24(%rsi) -; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rcx, 16(%rsi) ; ALL-NEXT: movq %rdi, 8(%rsi) -; ALL-NEXT: movq %rax, 48(%rsi) -; ALL-NEXT: movq %rcx, 56(%rsi) -; ALL-NEXT: movq %rdx, 32(%rsi) +; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 56(%rsi) +; ALL-NEXT: movq %rcx, 48(%rsi) ; ALL-NEXT: movq %rdi, 40(%rsi) +; ALL-NEXT: movq %rdx, 32(%rsi) ; ALL-NEXT: retq %in.elt.not = load i256, ptr %in.elt.ptr, align 64 %in.elt = xor i256 %in.elt.not, -1 diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -72,39 +72,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_sdiv_v4i32: @@ -215,39 +214,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_udiv_v4i32: @@ -358,39 +356,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_srem_v4i32: @@ -501,39 +498,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_urem_v4i32: diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -6214,13 +6214,13 @@ ; X64-NOBMI-LABEL: bextr64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %rdi +; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rax ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; @@ -6236,7 +6236,8 @@ ; X64-BMI2-LABEL: bextr64_32_c0: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: bzhil %edx, %eax, %eax +; X64-BMI2-NEXT: bzhiq %rdx, %rax, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i64 64, %numlowbits @@ -8130,22 +8131,22 @@ ; ; X64-NOBMI-LABEL: pr38938: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movq (%rsi), %rax -; X64-NOBMI-NEXT: shrq $19, %rax -; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC -; X64-NOBMI-NEXT: incl (%rdi,%rax) +; X64-NOBMI-NEXT: movl (%rsi), %eax +; X64-NOBMI-NEXT: shrl $21, %eax +; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF +; X64-NOBMI-NEXT: incl (%rdi,%rax,4) ; X64-NOBMI-NEXT: retq ; ; X64-BMINOTBM-LABEL: pr38938: ; X64-BMINOTBM: # %bb.0: ; X64-BMINOTBM-NEXT: movl $2581, %eax # imm = 0xA15 -; X64-BMINOTBM-NEXT: bextrq %rax, (%rsi), %rax +; X64-BMINOTBM-NEXT: bextrl %eax, (%rsi), %eax ; X64-BMINOTBM-NEXT: incl (%rdi,%rax,4) ; X64-BMINOTBM-NEXT: retq ; ; X64-BMITBM-LABEL: pr38938: ; X64-BMITBM: # %bb.0: -; X64-BMITBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15 +; X64-BMITBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15 ; X64-BMITBM-NEXT: incl (%rdi,%rax,4) ; X64-BMITBM-NEXT: retq %tmp = load i64, ptr %a1, align 8 diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -9,22 +9,17 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 -; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: foo: ; SSE42: # %bb.0: ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movl $255, %eax ; SSE42-NEXT: pinsrb $3, %eax, %xmm0 ; SSE42-NEXT: movd %xmm0, (%rdi) @@ -33,7 +28,7 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -162,11 +157,32 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; SSE-NEXT: retq ; -; AVX-LABEL: cat_ext_straddle: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: cat_ext_straddle: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: cat_ext_straddle: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cat_ext_straddle: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq %x = load <6 x i32>, ptr %px %y = load <6 x i32>, ptr %py %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> diff --git a/llvm/test/CodeGen/X86/extract-fp.ll b/llvm/test/CodeGen/X86/extract-fp.ll --- a/llvm/test/CodeGen/X86/extract-fp.ll +++ b/llvm/test/CodeGen/X86/extract-fp.ll @@ -86,8 +86,8 @@ define float @ext_maxnum_v4f32(<4 x float> %x) nounwind { ; CHECK-LABEL: ext_maxnum_v4f32: ; CHECK: # %bb.0: +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> ) %r = extractelement <4 x float> %v, i32 2 diff --git a/llvm/test/CodeGen/X86/extract-insert.ll b/llvm/test/CodeGen/X86/extract-insert.ll --- a/llvm/test/CodeGen/X86/extract-insert.ll +++ b/llvm/test/CodeGen/X86/extract-insert.ll @@ -32,8 +32,8 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, ptr %p) nounwind { ; X86-LABEL: extractelt_bitcast_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll --- a/llvm/test/CodeGen/X86/extract-lowbits.ll +++ b/llvm/test/CodeGen/X86/extract-lowbits.ll @@ -3031,23 +3031,25 @@ ; X64-NOBMI-LABEL: bzhi64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: negb %cl -; X64-NOBMI-NEXT: movq $-1, %rax +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; ; X64-BMI1-LABEL: bzhi64_32_c0: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: shll $8, %esi -; X64-BMI1-NEXT: bextrl %esi, %edi, %eax +; X64-BMI1-NEXT: bextrq %rsi, %rdi, %rax +; X64-BMI1-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: bzhi64_32_c0: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: bzhil %esi, %edi, %eax +; X64-BMI2-NEXT: bzhiq %rsi, %rdi, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %numhighbits = sub i64 64, %numlowbits %mask = lshr i64 -1, %numhighbits diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -571,16 +571,16 @@ define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: fmaxnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fmaxnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -594,9 +594,9 @@ define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fmaxnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -606,9 +606,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -623,16 +623,16 @@ define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: fminnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fminnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -646,9 +646,9 @@ define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fminnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -658,9 +658,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -406,10 +406,10 @@ ; X32-SSE2-NEXT: andl $-32, %esp ; X32-SSE2-NEXT: subl $64, %esp ; X32-SSE2-NEXT: movdqa zero, %xmm0 -; X32-SSE2-NEXT: movaps n1+16, %xmm1 -; X32-SSE2-NEXT: movaps n1, %xmm2 -; X32-SSE2-NEXT: movaps %xmm2, zero -; X32-SSE2-NEXT: movaps %xmm1, zero+16 +; X32-SSE2-NEXT: movaps n1, %xmm1 +; X32-SSE2-NEXT: movaps n1+16, %xmm2 +; X32-SSE2-NEXT: movaps %xmm2, zero+16 +; X32-SSE2-NEXT: movaps %xmm1, zero ; X32-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X32-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: movaps %xmm1, (%esp) @@ -444,8 +444,8 @@ ; X64-SSSE3-NEXT: movq n1@GOTPCREL(%rip), %rax ; X64-SSSE3-NEXT: movaps (%rax), %xmm1 ; X64-SSSE3-NEXT: movaps 16(%rax), %xmm2 -; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps %xmm2, zero+16(%rip) +; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X64-SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movaps %xmm1, (%rsp) diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll @@ -92,7 +92,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -103,7 +105,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -122,7 +126,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar2: @@ -133,7 +139,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll --- a/llvm/test/CodeGen/X86/fdiv.ll +++ b/llvm/test/CodeGen/X86/fdiv.ll @@ -85,11 +85,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: mulss %xmm2, %xmm3 ; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: mulss %xmm2, %xmm1 -; CHECK-NEXT: subss %xmm2, %xmm3 -; CHECK-NEXT: divss %xmm3, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addss %xmm2, %xmm1 +; CHECK-NEXT: divss %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: retq %sub1 = fsub fast float %a0, %a1 %mul2 = fmul fast float %sub1, %a2 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll --- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll @@ -68,14 +68,14 @@ define float @test_fneg_fma_subx_negy_negz_f32(float %w, float %x, float %y, float %z) { ; FMA3-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA3-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 +; FMA3-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA3-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 ; FMA3-NEXT: retq ; ; FMA4-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm3 +; FMA4-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm3 ; FMA4-NEXT: retq entry: %subx = fsub nsz float %w, %x diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -443,20 +443,20 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -756,43 +756,43 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x74] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x8c,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x98,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x1c,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $284, %esp ## encoding: [0x81,0xc4,0x1c,0x01,0x00,0x00] @@ -1336,84 +1336,84 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x40,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x4c,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x54] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x3c,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x2c,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] @@ -1508,13 +1508,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -1723,23 +1723,23 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $236, %esp ## encoding: [0x81,0xc4,0xec,0x00,0x00,0x00] @@ -2048,44 +2048,44 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x78] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x48] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1293,20 +1293,20 @@ ; ; FMA-NOINFS-LABEL: test_f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz float 1.0, %t %tx = fmul nsz float %x, %t @@ -1342,20 +1342,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v4f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x float> , %t %tx = fmul nsz <4 x float> %x, %t @@ -1391,20 +1391,20 @@ ; ; FMA-NOINFS-LABEL: test_v8f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x float> , %t %tx = fmul nsz <8 x float> %x, %t @@ -1440,20 +1440,20 @@ ; ; FMA-NOINFS-LABEL: test_f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddsd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz double 1.0, %t %tx = fmul nsz double %x, %t @@ -1492,20 +1492,20 @@ ; ; FMA-NOINFS-LABEL: test_v2f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v2f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v2f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <2 x double> , %t %tx = fmul nsz <2 x double> %x, %t @@ -1541,20 +1541,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v4f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x double> , %t %tx = fmul nsz <4 x double> %x, %t @@ -1612,17 +1612,26 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; FMA-LABEL: test_v4f32_fneg_fnmadd: ; FMA: # %bb.0: -; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; FMA-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; FMA-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; FMA-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f32_fneg_fnmadd: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; FMA4-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm3 +; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; AVX512-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x float> %a0, %a1 %neg0 = fsub nsz <4 x float> , %mul @@ -1634,17 +1643,23 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; FMA-LABEL: test_v4f64_fneg_fnmsub: ; FMA: # %bb.0: -; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; FMA-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; FMA-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f64_fneg_fnmsub: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 +; FMA4-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm3 +; FMA4-NEXT: vsubpd %ymm0, %ymm2, %ymm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; AVX512-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x double> %a0, %a1 %neg0 = fsub nsz <4 x double> , %mul @@ -1888,28 +1903,26 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; FMA-LABEL: fadd_fma_fmul_3: ; FMA: # %bb.0: -; FMA-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; FMA-NEXT: vmovapd %xmm2, %xmm0 +; FMA-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; FMA-NEXT: retq ; ; FMA4-LABEL: fadd_fma_fmul_3: ; FMA4: # %bb.0: -; FMA4-NEXT: vmulpd %xmm3, %xmm2, %xmm2 +; FMA4-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm4 = (xmm4 * xmm5) + xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4 ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm4 * xmm5) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_3: ; AVX512: # %bb.0: -; AVX512-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; AVX512-NEXT: retq %m1 = fmul fast <2 x double> %x1, %x2 %m2 = fmul fast <2 x double> %x3, %x4 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -849,24 +849,24 @@ ; ; FMA-NOINFS-LABEL: test_v16f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v16f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v16f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <16 x float> , %t %tx = fmul nsz <16 x float> %x, %t @@ -908,24 +908,24 @@ ; ; FMA-NOINFS-LABEL: test_v8f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x double> , %t %tx = fmul nsz <8 x double> %x, %t @@ -999,7 +999,10 @@ ; ; AVX512-LABEL: test_v16f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <16 x float> %a0, %a1 %neg0 = fsub nsz <16 x float> , %mul @@ -1023,7 +1026,9 @@ ; ; AVX512-LABEL: test_v8f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vsubpd %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <8 x double> %a0, %a1 %neg0 = fsub nsz <8 x double> , %mul diff --git a/llvm/test/CodeGen/X86/fmul-combines.ll b/llvm/test/CodeGen/X86/fmul-combines.ll --- a/llvm/test/CodeGen/X86/fmul-combines.ll +++ b/llvm/test/CodeGen/X86/fmul-combines.ll @@ -175,7 +175,10 @@ define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) { ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: ; CHECK: # %bb.0: +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; CHECK-NEXT: mulps %xmm0, %xmm1 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = fmul fast <4 x float> %x, %z = fmul fast <4 x float> %y, @@ -269,7 +272,14 @@ ; CHECK-LABEL: getNegatedExpression_crash: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm0, %xmm2 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: retq store float 0.0, ptr %p, align 1 %real = load float, ptr %p, align 1 diff --git a/llvm/test/CodeGen/X86/fold-call-3.ll b/llvm/test/CodeGen/X86/fold-call-3.ll --- a/llvm/test/CodeGen/X86/fold-call-3.ll +++ b/llvm/test/CodeGen/X86/fold-call-3.ll @@ -60,8 +60,8 @@ ; pre-RA-NEXT: movq %rax, %rsi ; pre-RA-NEXT: callq *560(%rcx) ; pre-RA-NEXT: incl %ebp -; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: cmpl _NumTrials(%rip), %ebp ; pre-RA-NEXT: jb LBB0_2 ; pre-RA-NEXT: ## %bb.3: diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -30,11 +30,10 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { ; NOBMI-LABEL: masked_merge1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: andl %edi, %esi -; NOBMI-NEXT: notl %eax -; NOBMI-NEXT: andl %edx, %eax -; NOBMI-NEXT: orl %esi, %eax +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: andl %edi, %eax +; NOBMI-NEXT: xorl %edx, %eax ; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fold-rmw-ops.ll b/llvm/test/CodeGen/X86/fold-rmw-ops.ll --- a/llvm/test/CodeGen/X86/fold-rmw-ops.ll +++ b/llvm/test/CodeGen/X86/fold-rmw-ops.ll @@ -1041,9 +1041,12 @@ define void @and32_imm_br() nounwind { ; CHECK-LABEL: and32_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andl $-2147483648, g32(%rip) # encoding: [0x81,0x25,A,A,A,A,0x00,0x00,0x00,0x80] -; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte +; CHECK-NEXT: movl $-2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80] ; CHECK-NEXT: # imm = 0x80000000 +; CHECK-NEXT: andl g32(%rip), %eax # encoding: [0x23,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte_relax +; CHECK-NEXT: movl %eax, g32(%rip) # encoding: [0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 @@ -1127,9 +1130,12 @@ define void @and16_imm_br() nounwind { ; CHECK-LABEL: and16_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andw $-32768, g16(%rip) # encoding: [0x66,0x81,0x25,A,A,A,A,0x00,0x80] -; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte +; CHECK-NEXT: movzwl g16(%rip), %eax # encoding: [0x0f,0xb7,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte +; CHECK-NEXT: andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00] ; CHECK-NEXT: # imm = 0x8000 +; CHECK-NEXT: movw %ax, g16(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1133,10 +1133,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1159,10 +1159,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi @@ -1478,10 +1478,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1504,10 +1504,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll --- a/llvm/test/CodeGen/X86/fp-logic.ll +++ b/llvm/test/CodeGen/X86/fp-logic.ll @@ -231,8 +231,9 @@ define float @movmsk(float %x) { ; CHECK-LABEL: movmsk: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: shll $31, %eax +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast float %x to i32 %and = and i32 %bc1, 2147483648 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -496,10 +496,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -635,10 +635,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -675,10 +675,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -715,10 +715,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -753,10 +753,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -792,10 +792,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -833,10 +833,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -873,10 +873,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -913,10 +913,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -951,10 +951,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -990,10 +990,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1031,10 +1031,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1123,11 +1123,9 @@ ; X64-SSE-NEXT: movaps %xmm0, %xmm1 ; X64-SSE-NEXT: callq __multf3@PLT ; X64-SSE-NEXT: movaps %xmm0, (%rsp) -; X64-SSE-NEXT: movq (%rsp), %rcx -; X64-SSE-NEXT: movq %rcx, %rdx -; X64-SSE-NEXT: shrq $32, %rdx +; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-SSE-NEXT: xorl %eax, %eax -; X64-SSE-NEXT: orl %ecx, %edx +; X64-SSE-NEXT: orl (%rsp), %ecx ; X64-SSE-NEXT: sete %al ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: retq @@ -1169,11 +1167,9 @@ ; X64-AVX-NEXT: vmovaps %xmm0, %xmm1 ; X64-AVX-NEXT: callq __multf3@PLT ; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) -; X64-AVX-NEXT: movq (%rsp), %rcx -; X64-AVX-NEXT: movq %rcx, %rdx -; X64-AVX-NEXT: shrq $32, %rdx +; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: orl %ecx, %edx +; X64-AVX-NEXT: orl (%rsp), %ecx ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: retq @@ -1221,14 +1217,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: addl $3, %ecx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl $3, %esi ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, 8(%eax) -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %edi, 12(%eax) +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %ecx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: movl %edx, 12(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -42,10 +42,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -87,10 +87,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -132,10 +132,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -177,10 +177,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -226,10 +226,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -271,10 +271,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -312,10 +312,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -353,10 +353,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -394,10 +394,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -435,10 +435,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -476,10 +476,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -517,10 +517,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -558,10 +558,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -599,10 +599,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -644,10 +644,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -689,10 +689,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -730,10 +730,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -775,10 +775,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -817,10 +817,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -858,10 +858,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -899,10 +899,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -940,10 +940,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -981,10 +981,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1022,10 +1022,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1063,10 +1063,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -84,10 +84,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -171,10 +171,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -258,10 +258,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -345,10 +345,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -432,10 +432,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, vf128+8 -; X86-NEXT: movl %edx, vf128+12 -; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 ; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll --- a/llvm/test/CodeGen/X86/fpclamptosat.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat.ll @@ -55,7 +55,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -121,7 +121,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -203,7 +203,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -266,7 +266,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -325,7 +325,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -402,7 +402,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -423,23 +423,13 @@ define i64 @stest_f64i64(double %x) { ; CHECK-LABEL: stest_f64i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixdfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rcx, %rax ; CHECK-NEXT: retq entry: %conv = fptosi double %x to i128 @@ -479,14 +469,11 @@ ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -504,23 +491,13 @@ define i64 @stest_f32i64(float %x) { ; CHECK-LABEL: stest_f32i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rcx, %rax ; CHECK-NEXT: retq entry: %conv = fptosi float %x to i128 @@ -560,14 +537,11 @@ ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -587,19 +561,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixhfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; CHECK-NEXT: cmovaeq %rax, %rcx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rcx, %rdx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rdx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -641,14 +613,11 @@ ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -15,31 +15,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -76,11 +77,12 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -107,14 +109,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -163,27 +165,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -191,30 +193,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm5, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm5 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm5 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm5, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -274,22 +277,22 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm4 ; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 -; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pand %xmm6, %xmm4 ; CHECK-NEXT: pand %xmm4, %xmm0 ; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm5, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 @@ -326,27 +329,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -421,28 +424,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -453,30 +456,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm4, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -554,23 +558,23 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pand %xmm5, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pandn %xmm1, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm6 ; CHECK-NEXT: pandn %xmm1, %xmm0 @@ -625,28 +629,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -984,27 +988,27 @@ ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm4, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm5, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; CHECK-NEXT: pxor %xmm2, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: por %xmm2, %xmm4 -; CHECK-NEXT: pslld $16, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm1, %xmm5 +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: pslld $16, %xmm5 +; CHECK-NEXT: psrad $16, %xmm5 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm4, %xmm0 +; CHECK-NEXT: packssdw %xmm5, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -1591,31 +1595,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -1650,11 +1655,12 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1680,14 +1686,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1733,59 +1739,60 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm5 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm5, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -1842,26 +1849,26 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pandn %xmm4, %xmm3 +; CHECK-NEXT: pandn %xmm5, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm5, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm5, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq @@ -1893,33 +1900,33 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 @@ -1928,7 +1935,7 @@ ; CHECK-NEXT: pand %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm3 ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm4 @@ -1986,63 +1993,64 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2117,27 +2125,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn %xmm3, %xmm2 +; CHECK-NEXT: pandn %xmm4, %xmm2 ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm6 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm4, %xmm0 ; CHECK-NEXT: por %xmm6, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; CHECK-NEXT: addq $72, %rsp @@ -2187,36 +2195,36 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 @@ -2540,14 +2548,14 @@ ; CHECK-NEXT: movdqa %xmm4, %xmm0 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; CHECK-NEXT: pandn %xmm3, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 ; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: pandn %xmm3, %xmm4 ; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: pslld $16, %xmm4 ; CHECK-NEXT: psrad $16, %xmm4 diff --git a/llvm/test/CodeGen/X86/fpenv-combine.ll b/llvm/test/CodeGen/X86/fpenv-combine.ll --- a/llvm/test/CodeGen/X86/fpenv-combine.ll +++ b/llvm/test/CodeGen/X86/fpenv-combine.ll @@ -22,13 +22,13 @@ ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: movq %rsi, 24(%r14) -; X64-NEXT: movq %rcx, (%r14) -; X64-NEXT: movq %rdx, 8(%r14) -; X64-NEXT: movq %rax, 16(%r14) -; X64-NEXT: movq %rax, 16(%rbx) +; X64-NEXT: movq %rdx, 16(%r14) +; X64-NEXT: movq %rcx, 8(%r14) +; X64-NEXT: movq %rax, (%r14) ; X64-NEXT: movq %rsi, 24(%rbx) -; X64-NEXT: movq %rcx, (%rbx) -; X64-NEXT: movq %rdx, 8(%rbx) +; X64-NEXT: movq %rdx, 16(%rbx) +; X64-NEXT: movq %rcx, 8(%rbx) +; X64-NEXT: movq %rax, (%rbx) ; X64-NEXT: addq $40, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 @@ -72,8 +72,8 @@ ; X64-NEXT: movq (%rsp), %rax ; X64-NEXT: andl $1, %eax ; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: movq $0, 16(%rbx) ; X64-NEXT: movq $0, 24(%rbx) +; X64-NEXT: movq $0, 16(%rbx) ; X64-NEXT: movq $0, 8(%rbx) ; X64-NEXT: addq $32, %rsp ; X64-NEXT: popq %rbx diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll --- a/llvm/test/CodeGen/X86/fpenv.ll +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -13,7 +13,9 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl @@ -22,7 +24,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) ; X86-SSE-NEXT: stmxcsr (%esp) ; X86-SSE-NEXT: orb $96, {{[0-9]+}}(%esp) @@ -33,7 +37,9 @@ ; X64-LABEL: func_01: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: orb $12, -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: orb $96, -{{[0-9]+}}(%rsp) @@ -83,8 +89,8 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-NOSSE-NEXT: orl $2048, %eax # imm = 0x800 ; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -95,8 +101,8 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-SSE-NEXT: orl $2048, %eax # imm = 0x800 ; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -112,8 +118,8 @@ ; X64-LABEL: func_03: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $62463, %eax # imm = 0xF3FF ; X64-NEXT: orl $2048, %eax # imm = 0x800 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) @@ -133,8 +139,8 @@ ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: movzwl (%esp), %eax +; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-NOSSE-NEXT: orl $1024, %eax # imm = 0x400 ; X86-NOSSE-NEXT: movw %ax, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -145,8 +151,8 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: movzwl (%esp), %eax +; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF ; X86-SSE-NEXT: orl $1024, %eax # imm = 0x400 ; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -162,8 +168,8 @@ ; X64-LABEL: func_04: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF -; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: andl $62463, %eax # imm = 0xF3FF ; X64-NEXT: orl $1024, %eax # imm = 0x400 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) @@ -189,8 +195,8 @@ ; X86-NOSSE-NEXT: shll %cl, %eax ; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-NOSSE-NEXT: fnstcw (%esp) -; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X86-NOSSE-NEXT: andl (%esp), %ecx +; X86-NOSSE-NEXT: movzwl (%esp), %ecx +; X86-NOSSE-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X86-NOSSE-NEXT: orl %eax, %ecx ; X86-NOSSE-NEXT: movw %cx, (%esp) ; X86-NOSSE-NEXT: fldcw (%esp) @@ -207,8 +213,8 @@ ; X86-SSE-NEXT: shll %cl, %eax ; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00 ; X86-SSE-NEXT: fnstcw (%esp) -; X86-SSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: movzwl (%esp), %ecx +; X86-SSE-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X86-SSE-NEXT: orl %eax, %ecx ; X86-SSE-NEXT: movw %cx, (%esp) ; X86-SSE-NEXT: fldcw (%esp) @@ -230,8 +236,8 @@ ; X64-NEXT: shll %cl, %eax ; X64-NEXT: andl $3072, %eax # imm = 0xC00 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; X64-NEXT: movl $-3073, %ecx # imm = 0xF3FF -; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: andl $62463, %ecx # imm = 0xF3FF ; X64-NEXT: orl %eax, %ecx ; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll --- a/llvm/test/CodeGen/X86/freeze-unary.ll +++ b/llvm/test/CodeGen/X86/freeze-unary.ll @@ -6,6 +6,7 @@ ; X86-LABEL: freeze_sext: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cwtl ; X86-NEXT: retl ; ; X64-LABEL: freeze_sext: @@ -40,6 +41,7 @@ ; X86-LABEL: freeze_zext: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; ; X64-LABEL: freeze_zext: diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -349,15 +349,16 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%edx), %edx ; X86-NEXT: andl $15, %edx -; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7] -; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vmovd %edx, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7] +; X86-NEXT: vpand %xmm3, %xmm1, %xmm1 +; X86-NEXT: vmovdqa %xmm1, (%ecx) ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm3, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) ; X86-NEXT: retl ; @@ -365,15 +366,15 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: andl $15, %eax -; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vmovdqa %xmm0, (%rdx) ; X64-NEXT: vmovd %eax, %xmm0 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; X64-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7] +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vmovdqa %xmm2, (%rdx) +; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] +; X64-NEXT: vpand %xmm3, %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -180,7 +180,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: shlq $27, %rsi @@ -347,7 +347,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: addl $27, %ecx @@ -980,20 +980,18 @@ ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: leal (%eax,%eax,2), %edx -; X86-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi -; X86-SSE2-NEXT: movl 4(%ecx,%edx,4), %edi -; X86-SSE2-NEXT: shrdl $8, %esi, %edi +; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi +; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx +; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi +; X86-SSE2-NEXT: shrdl $8, %edx, %edi ; X86-SSE2-NEXT: xorl %eax, %edi ; X86-SSE2-NEXT: sarl $31, %eax -; X86-SSE2-NEXT: movzbl 10(%ecx,%edx,4), %ecx -; X86-SSE2-NEXT: shll $16, %ecx -; X86-SSE2-NEXT: orl %esi, %ecx -; X86-SSE2-NEXT: shll $8, %ecx -; X86-SSE2-NEXT: movl %ecx, %edx -; X86-SSE2-NEXT: sarl $8, %edx +; X86-SSE2-NEXT: movsbl 10(%ecx,%esi,4), %ecx +; X86-SSE2-NEXT: movl %ecx, %esi +; X86-SSE2-NEXT: shll $16, %esi +; X86-SSE2-NEXT: orl %edx, %esi ; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: shldl $24, %edx, %ecx +; X86-SSE2-NEXT: shldl $24, %esi, %ecx ; X86-SSE2-NEXT: xorl %eax, %ecx ; X86-SSE2-NEXT: orl %ecx, %edi ; X86-SSE2-NEXT: jne .LBB46_1 diff --git a/llvm/test/CodeGen/X86/h-registers-2.ll b/llvm/test/CodeGen/X86/h-registers-2.ll --- a/llvm/test/CodeGen/X86/h-registers-2.ll +++ b/llvm/test/CodeGen/X86/h-registers-2.ll @@ -8,8 +8,7 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl %ah, %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movb $77, (%ecx,%eax,8) ; CHECK-NEXT: shll $3, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -195,12 +195,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm1, %edi +; SSE3-NEXT: addl %eax, %edi ; SSE3-NEXT: movd %esi, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: movd %edi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %ecx, %xmm2 ; SSE3-NEXT: movd %edx, %xmm0 @@ -311,12 +311,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: movd %xmm1, %edx -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: movd %xmm0, %edi +; SSE3-NEXT: subl %edi, %esi +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movd %ecx, %xmm0 diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -72,11 +72,11 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: divpd %xmm2, %xmm1 -; SSE2-NEXT: divpd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE2-NEXT: addsd %xmm2, %xmm3 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] +; SSE2-NEXT: divpd %xmm3, %xmm1 +; SSE2-NEXT: divpd %xmm3, %xmm0 ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -121,25 +121,26 @@ define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse3_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] -; SSE-NEXT: haddps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: haddps %xmm2, %xmm4 +; SSE-NEXT: haddps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse3_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -554,6 +554,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v8i32b: @@ -670,6 +671,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v8i32b: @@ -814,6 +816,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v16i16b: @@ -954,6 +957,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v16i16b: @@ -1013,15 +1017,45 @@ } define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { -; SSE-LABEL: PR34724_2: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE_SLOW-LABEL: PR34724_2: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSE_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE_SLOW-NEXT: addps %xmm1, %xmm2 +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE_SLOW-NEXT: retq ; -; AVX-LABEL: PR34724_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE_FAST-LABEL: PR34724_2: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddps %xmm1, %xmm0 +; SSE_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: PR34724_2: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: PR34724_2: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: PR34724_2: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: PR34724_2: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t2 = fadd <4 x float> %t0, %t1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -216,7 +216,7 @@ ; AVX-FAST-LABEL: test8_undef: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX-FAST-NEXT: retq %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 @@ -504,32 +504,17 @@ } define <4 x float> @add_ps_030(<4 x float> %x) { -; SSE-SLOW-LABEL: add_ps_030: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 -; SSE-SLOW-NEXT: retq -; -; SSE-FAST-LABEL: add_ps_030: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: add_ps_030: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq +; SSE-LABEL: add_ps_030: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-NEXT: retq ; -; AVX-FAST-LABEL: add_ps_030: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-FAST-NEXT: retq +; AVX-LABEL: add_ps_030: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -584,14 +569,14 @@ ; SSE-LABEL: add_ps_016: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,1] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_016: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,1] ; AVX-NEXT: retq %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> @@ -1127,40 +1112,68 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_u123: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1 +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 -; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f64_u123: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: movapd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_u123: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_u123: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX512-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1193,21 +1206,48 @@ ; SSE-FAST-NEXT: haddpd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_0u23: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1239,28 +1279,42 @@ ; SSE-FAST-NEXT: movapd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> @@ -1292,22 +1346,39 @@ ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_012u: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_012u: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %5 = fadd <2 x double> %3, %4 diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1638,33 +1638,25 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v8f32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddps %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 +; SSE3-FAST-NEXT: addps %xmm2, %xmm1 +; SSE3-FAST-NEXT: movaps %xmm1, %xmm2 +; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-FAST-NEXT: addps %xmm1, %xmm2 ; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 ; SSE3-FAST-NEXT: addss %xmm2, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1681,29 +1673,20 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v4f64: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2 -; SSE3-FAST-NEXT: addsd %xmm2, %xmm0 +; SSE3-FAST-NEXT: addpd %xmm2, %xmm1 +; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE3-FAST-NEXT: addsd %xmm1, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v4f64: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v4f64: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } @@ -1751,15 +1734,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: PR39936_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %4 = fadd <8 x float> %2, %3 @@ -1830,22 +1804,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_8: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_8: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: hadd32_8: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> %x227 = fadd <8 x float> %x225, %x226 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> @@ -1880,14 +1846,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_16: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -1932,7 +1890,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -1951,14 +1910,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2003,7 +1954,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -2022,14 +1974,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2056,21 +2000,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2100,22 +2037,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32_wrong_flags: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd fast <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2150,13 +2079,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v16f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> %x0213 = fadd <16 x float> %x, %x23 %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -64,9 +64,10 @@ ; ; CHECK-I686-LABEL: test_bitcast_to_half: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: movw %ax, (%ecx) +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl %val_fp = bitcast i16 %in to half store half %val_fp, ptr %addr @@ -1235,7 +1236,7 @@ ; CHECK-LIBCALL-LABEL: fcopysign: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm1, %eax -; CHECK-LIBCALL-NEXT: andl $-32768, %eax # imm = 0x8000 +; CHECK-LIBCALL-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-LIBCALL-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-LIBCALL-NEXT: orl %eax, %ecx @@ -1245,7 +1246,7 @@ ; BWON-F16C-LABEL: fcopysign: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax -; BWON-F16C-NEXT: andl $-32768, %eax # imm = 0x8000 +; BWON-F16C-NEXT: andl $32768, %eax # imm = 0x8000 ; BWON-F16C-NEXT: vpextrw $0, %xmm0, %ecx ; BWON-F16C-NEXT: andl $32767, %ecx # imm = 0x7FFF ; BWON-F16C-NEXT: orl %eax, %ecx @@ -1254,8 +1255,8 @@ ; ; CHECK-I686-LABEL: fcopysign: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movl $-32768, %eax # imm = 0x8000 -; CHECK-I686-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; CHECK-I686-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-I686-NEXT: orl %eax, %ecx @@ -2113,37 +2114,37 @@ define void @pr63114() { ; CHECK-LIBCALL-LABEL: pr63114: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm4 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] +; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm3 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,3,4,5,6,7] ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3,0,3] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm3 +; CHECK-LIBCALL-NEXT: por %xmm2, %xmm3 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm3 +; CHECK-LIBCALL-NEXT: por %xmm5, %xmm3 ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm6 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm6 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm6 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3,0,3] -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm4 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm4 ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm7 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm7 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm7 ; CHECK-LIBCALL-NEXT: movdqu %xmm7, 0 -; CHECK-LIBCALL-NEXT: movdqu %xmm4, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm6, 48 +; CHECK-LIBCALL-NEXT: movdqu %xmm3, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm0, 16 ; CHECK-LIBCALL-NEXT: retq ; @@ -2154,61 +2155,61 @@ ; BWON-F16C-NEXT: vbroadcastss (%rax), %xmm2 ; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; BWON-F16C-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] -; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; BWON-F16C-NEXT: vpsllq $48, %xmm3, %xmm4 -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; BWON-F16C-NEXT: vpor %xmm3, %xmm2, %xmm2 -; BWON-F16C-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3],xmm1[2,0] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; BWON-F16C-NEXT: vpor %xmm3, %xmm1, %xmm1 -; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,3,4,5,6,7] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm3[7] +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,3,3,4,5,6,7] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; BWON-F16C-NEXT: vpsllq $48, %xmm4, %xmm5 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] +; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,5,5,5,5] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3],xmm6[4,5,6,7] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm2, %xmm2 +; BWON-F16C-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3],xmm0[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; BWON-F16C-NEXT: vmovups %ymm0, 0 -; BWON-F16C-NEXT: vmovups %ymm1, 32 +; BWON-F16C-NEXT: vmovups %ymm0, 32 +; BWON-F16C-NEXT: vmovups %ymm3, 0 ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: pr63114: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movdqu (%eax), %xmm6 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7] +; CHECK-I686-NEXT: movdqu (%eax), %xmm3 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,3,4,5,6,7] ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-I686-NEXT: por %xmm2, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; CHECK-I686-NEXT: pand %xmm3, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] -; CHECK-I686-NEXT: por %xmm4, %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-I686-NEXT: pand %xmm1, %xmm5 -; CHECK-I686-NEXT: por %xmm2, %xmm5 -; CHECK-I686-NEXT: pand %xmm3, %xmm5 -; CHECK-I686-NEXT: por %xmm4, %xmm5 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5] -; CHECK-I686-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3,0,3] -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; CHECK-I686-NEXT: pand %xmm4, %xmm0 +; CHECK-I686-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] +; CHECK-I686-NEXT: por %xmm5, %xmm0 +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,7,7] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3,0,3] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: pand %xmm1, %xmm3 +; CHECK-I686-NEXT: por %xmm2, %xmm3 +; CHECK-I686-NEXT: pand %xmm4, %xmm3 +; CHECK-I686-NEXT: por %xmm5, %xmm3 +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-I686-NEXT: pand %xmm1, %xmm6 ; CHECK-I686-NEXT: por %xmm2, %xmm6 -; CHECK-I686-NEXT: pand %xmm3, %xmm6 -; CHECK-I686-NEXT: por %xmm4, %xmm6 +; CHECK-I686-NEXT: pand %xmm4, %xmm6 +; CHECK-I686-NEXT: por %xmm5, %xmm6 ; CHECK-I686-NEXT: pand %xmm1, %xmm7 ; CHECK-I686-NEXT: por %xmm2, %xmm7 -; CHECK-I686-NEXT: pand %xmm3, %xmm7 -; CHECK-I686-NEXT: por %xmm4, %xmm7 +; CHECK-I686-NEXT: pand %xmm4, %xmm7 +; CHECK-I686-NEXT: por %xmm5, %xmm7 ; CHECK-I686-NEXT: movdqu %xmm7, 0 -; CHECK-I686-NEXT: movdqu %xmm6, 32 -; CHECK-I686-NEXT: movdqu %xmm5, 48 +; CHECK-I686-NEXT: movdqu %xmm6, 48 +; CHECK-I686-NEXT: movdqu %xmm3, 32 ; CHECK-I686-NEXT: movdqu %xmm0, 16 ; CHECK-I686-NEXT: retl %1 = load <24 x half>, ptr poison, align 2 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -409,16 +409,18 @@ ; X64-BMI1-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: movq %rsi, %rcx +; X64-BMI1-NEXT: movl $1, %eax ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shlq %cl, %rdi -; X64-BMI1-NEXT: testb $1, %dil +; X64-BMI1-NEXT: shrq %cl, %rax +; X64-BMI1-NEXT: testl %edi, %eax ; X64-BMI1-NEXT: sete %al ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: testb $1, %al +; X64-BMI2-NEXT: movl $1, %eax +; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax +; X64-BMI2-NEXT: testl %edi, %eax ; X64-BMI2-NEXT: sete %al ; X64-BMI2-NEXT: retq %t0 = lshr i64 1, %y @@ -497,45 +499,45 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_splat_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x @@ -581,45 +583,45 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -471,10 +471,10 @@ ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: @@ -559,10 +559,10 @@ ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -99,7 +99,7 @@ ; AVX1-FAST-LABEL: PR37890_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -235,7 +235,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -480,8 +480,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -553,8 +553,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -565,7 +565,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -629,9 +629,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -685,9 +685,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -697,9 +697,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -758,11 +758,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -806,11 +809,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -818,11 +824,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -910,13 +919,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -988,13 +1000,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1002,13 +1017,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1147,8 +1165,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1264,8 +1282,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1278,7 +1296,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1361,9 +1379,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1433,9 +1451,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1447,9 +1465,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1519,11 +1537,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1575,11 +1596,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1587,13 +1611,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1700,13 +1727,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1794,13 +1824,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1808,15 +1841,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1869,15 +1905,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1901,22 +1950,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1956,15 +2021,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1988,22 +2066,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2065,17 +2159,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2121,26 +2230,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2205,17 +2332,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2261,26 +2403,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -483,8 +483,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -557,8 +557,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -569,7 +569,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -633,9 +633,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -689,9 +689,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -701,9 +701,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -762,11 +762,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -810,11 +813,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -822,11 +828,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -914,13 +923,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -992,13 +1004,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1006,13 +1021,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1151,8 +1169,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1268,8 +1286,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1282,7 +1300,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1365,9 +1383,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1437,9 +1455,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1451,9 +1469,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1523,11 +1541,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1579,11 +1600,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1591,13 +1615,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1704,13 +1731,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1798,13 +1828,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1812,15 +1845,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1873,15 +1909,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1905,22 +1954,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1960,15 +2025,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1992,22 +2070,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2069,17 +2163,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2125,26 +2234,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2209,17 +2336,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2265,26 +2407,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -545,11 +545,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -636,10 +636,10 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -652,7 +652,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -725,9 +725,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -790,9 +790,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -802,9 +802,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -869,12 +869,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -924,12 +926,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -937,11 +941,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1011,14 +1018,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1072,14 +1081,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1087,13 +1098,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1256,10 +1270,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1401,10 +1415,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1419,7 +1433,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1517,9 +1531,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1604,9 +1618,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1618,9 +1632,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1698,12 +1712,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1763,12 +1779,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1776,13 +1794,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1863,14 +1884,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1932,14 +1955,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1947,15 +1972,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2012,16 +2040,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2062,21 +2103,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2120,16 +2166,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2170,21 +2229,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2231,18 +2295,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2288,25 +2367,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2356,18 +2440,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2413,25 +2512,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -489,11 +489,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -582,10 +582,10 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -598,7 +598,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -671,9 +671,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -736,9 +736,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -748,9 +748,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -813,8 +813,13 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -863,8 +868,13 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -873,8 +883,13 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -939,10 +954,15 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -991,10 +1011,15 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1003,10 +1028,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1172,10 +1202,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1319,10 +1349,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1337,7 +1367,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1435,9 +1465,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1522,9 +1552,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1536,9 +1566,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1616,8 +1646,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1678,8 +1713,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1688,10 +1728,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1767,10 +1812,15 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1827,10 +1877,15 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1839,12 +1894,17 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1902,13 +1962,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1936,13 +2009,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp ult <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1983,13 +2082,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2017,13 +2129,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp ult <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2064,15 +2202,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2100,15 +2253,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp ult <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2152,15 +2335,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2188,15 +2386,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp ult <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -20,9 +20,16 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32: @@ -32,17 +39,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,6 +57,25 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -82,13 +108,19 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -103,15 +135,15 @@ ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -123,18 +155,18 @@ ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -173,24 +205,22 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 +; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2 -; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6 ; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1 @@ -352,16 +382,12 @@ ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4 -; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2 -; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6 -; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm4 +; SSSE3-FAST-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: @@ -425,8 +451,10 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -448,8 +476,10 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -524,7 +554,7 @@ ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,2] ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] @@ -550,7 +580,7 @@ ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,2] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 @@ -638,20 +668,23 @@ ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -660,19 +693,22 @@ ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4 ; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 +; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-FAST-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-FAST-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-FAST-NEXT: paddd %xmm3, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm0, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32: @@ -948,24 +984,25 @@ ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -983,27 +1020,31 @@ ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1015,10 +1056,12 @@ ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0) %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1) @@ -1035,24 +1078,23 @@ ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1066,69 +1108,73 @@ ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-SLOW-NEXT: retq +; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %eax +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %edx +; AVX-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vmovd %xmm2, %eax +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq ; -; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq +; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovd %xmm2, %eax +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2) diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -74,13 +74,9 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: movq %rsi, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: addq $1, %rax -; X64-NEXT: adcq $0, %rdx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %t0 = add <1 x i128> %x, %t1 = add <1 x i128> %y, %t0 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -323,31 +323,32 @@ ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pxor %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067713,18446744071562067713] +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X64-SSE-NEXT: por %xmm3, %xmm2 -; X64-SSE-NEXT: pand %xmm2, %xmm0 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: por %xmm0, %xmm2 -; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pxor %xmm3, %xmm3 -; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; X64-SSE-NEXT: pand %xmm5, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE-NEXT: por %xmm2, %xmm3 +; X64-SSE-NEXT: pand %xmm3, %xmm0 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE-NEXT: por %xmm0, %xmm3 +; X64-SSE-NEXT: pxor %xmm3, %xmm1 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] -; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; X64-SSE-NEXT: pand %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-SSE-NEXT: por %xmm1, %xmm0 -; X64-SSE-NEXT: pand %xmm0, %xmm2 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: por %xmm2, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE-NEXT: por %xmm0, %xmm1 +; X64-SSE-NEXT: pand %xmm1, %xmm3 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE-NEXT: por %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -39,7 +39,7 @@ ; X86-NO-CMOV: # %bb.0: ; X86-NO-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movswl %ax, %ecx -; X86-NO-CMOV-NEXT: sarl $15, %ecx +; X86-NO-CMOV-NEXT: shrl $15, %ecx ; X86-NO-CMOV-NEXT: xorl %ecx, %eax ; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -513,11 +513,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -572,9 +574,16 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x64_sext: @@ -645,12 +654,15 @@ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -713,10 +725,18 @@ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x64_sext: @@ -1002,6 +1022,22 @@ ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpextrb $0, %xmm0, %ecx +; AVX2-NEXT: andb $1, %cl +; AVX2-NEXT: negb %cl +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x8_sext: @@ -1010,6 +1046,27 @@ ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: negb %cl +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: eq_or_to_abs_vec4x8_sext: @@ -1018,6 +1075,19 @@ ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: shll $8, %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: orl %ecx, %edx +; SSE2-NEXT: shll $8, %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $1, %edx, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp eq <4 x i8> %x, %cmp2 = icmp eq <4 x i8> %x, @@ -1114,6 +1184,22 @@ ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpextrw $0, %xmm0, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1124,7 +1210,22 @@ ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: pandn %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pextrw $0, %xmm1, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrw $1, %eax, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pinsrw $3, %eax, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1135,7 +1236,22 @@ ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pextrw $0, %xmm1, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pinsrw $3, %eax, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i16> %x, %cmp2 = icmp ne <4 x i16> %x, diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -163,7 +163,7 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll @@ -198,7 +198,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl %ax, %ecx -; X86-NEXT: sarl $15, %ecx +; X86-NEXT: shrl $15, %ecx ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movzwl %ax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -13,30 +13,34 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi +; X86-NEXT: addl $1, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: shldl $4, %edx, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $4, %esi, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $28, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -52,9 +56,11 @@ ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: addq $1, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $60, %rcx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: shldq $4, %rax, %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: shrq $60, %rsi +; X64-NEXT: orq %rcx, %rsi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: retq @@ -73,21 +79,27 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -98,21 +110,27 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: setne %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -123,19 +141,27 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shldl $17, %eax, %edx +; X86-NEXT: shll $17, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -146,19 +172,27 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shldl $17, %eax, %edx +; X86-NEXT: shll $17, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: setne %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -233,8 +267,9 @@ ; ; X64-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shldq $17, %rsi, %rdi +; X64-NEXT: shlq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl.a = shl i64 %a, 17 diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -108,7 +108,15 @@ ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) # imm = 0x180 +; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: movzbl 6(%rdi), %ecx +; X64-NEXT: shll $16, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orq $384, %rax # imm = 0x180 +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 @@ -163,19 +171,20 @@ ; ; X64-LABEL: i56_insert_bit: ; X64: # %bb.0: -; X64-NEXT: movzwl 4(%rdi), %eax -; X64-NEXT: movzbl 6(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: shlq $32, %rcx -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movzwl 4(%rdi), %ecx +; X64-NEXT: movzbl 6(%rdi), %edx +; X64-NEXT: shll $16, %edx +; X64-NEXT: orl %ecx, %edx +; X64-NEXT: shlq $32, %rdx +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: shlq $13, %rax +; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF ; X64-NEXT: orq %rcx, %rax -; X64-NEXT: shll $13, %esi -; X64-NEXT: andq $-8193, %rax # imm = 0xDFFF -; X64-NEXT: orl %eax, %esi -; X64-NEXT: shrq $32, %rax -; X64-NEXT: movw %ax, 4(%rdi) -; X64-NEXT: movl %esi, (%rdi) +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movw %cx, 4(%rdi) +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -31,18 +31,18 @@ ; AVX-32: # %bb.0: # %L.entry ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 -; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-32-NEXT: vmovups %ymm0, 608(%eax) ; AVX-32-NEXT: vzeroupper ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: PR15298: ; AVX-64: # %bb.0: # %L.entry -; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 -; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) ; AVX-64-NEXT: vzeroupper ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2270,14 +2270,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 112(%rdi) -; SSE-NEXT: movdqa %xmm0, 64(%rdi) +; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 80(%rdi) -; SSE-NEXT: movdqa %xmm0, 32(%rdi) +; SSE-NEXT: movdqa %xmm0, 64(%rdi) ; SSE-NEXT: movdqa %xmm0, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm0, 32(%rdi) ; SSE-NEXT: movdqa %xmm0, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: leal 2147483647(%rax), %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: cmovnsl %eax, %ecx @@ -2293,8 +2293,8 @@ ; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) ; AVX1-NEXT: movl (%rdi), %eax ; AVX1-NEXT: vmovaps %ymm1, (%rdi) @@ -2314,8 +2314,8 @@ ; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 ; AVX2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) ; AVX2-NEXT: movl (%rdi), %eax ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) @@ -2357,8 +2357,8 @@ ; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 ; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; X86AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx) +; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx) ; X86AVX2-NEXT: movl (%ecx), %eax ; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -337,19 +337,22 @@ ; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE2: # %bb.0: ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: pinsrw $7, %eax, %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE3: # %bb.0: ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: xorl %eax, %eax +; SSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSSE3: # %bb.0: ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz: @@ -359,10 +362,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -253,7 +253,7 @@ define i1 @is_inf_f80(x86_fp80 %x) { ; CHECK-32-LABEL: is_inf_f80: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: notl %eax ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx @@ -265,7 +265,7 @@ ; ; CHECK-64-LABEL: is_inf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; CHECK-64-NEXT: notl %eax ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx @@ -308,9 +308,9 @@ ; CHECK-32-LABEL: is_neginf_f80: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: orl %ecx, %eax ; CHECK-32-NEXT: sete %al @@ -319,10 +319,10 @@ ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: orq %rax, %rcx +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF +; CHECK-64-NEXT: orq %rcx, %rax ; CHECK-64-NEXT: sete %al ; CHECK-64-NEXT: retq entry: @@ -370,22 +370,22 @@ ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movswl %dx, %ecx -; CHECK-32-NEXT: sarl $15, %ecx +; CHECK-32-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movl %ecx, %edx +; CHECK-32-NEXT: sarl $31, %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF -; CHECK-32-NEXT: decl %edx -; CHECK-32-NEXT: movzwl %dx, %edx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: decl %ecx +; CHECK-32-NEXT: movzwl %cx, %ecx ; CHECK-32-NEXT: xorl %esi, %esi -; CHECK-32-NEXT: cmpl $32766, %edx # imm = 0x7FFE +; CHECK-32-NEXT: cmpl $32766, %ecx # imm = 0x7FFE ; CHECK-32-NEXT: sbbl %esi, %esi -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: testl %ecx, %ecx -; CHECK-32-NEXT: setns %cl +; CHECK-32-NEXT: setb %cl +; CHECK-32-NEXT: testl %edx, %edx +; CHECK-32-NEXT: setns %dl ; CHECK-32-NEXT: shrl $31, %eax -; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: andb %dl, %al +; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: # kill: def $al killed $al killed $eax ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -441,9 +441,10 @@ ; ; CHECK-64-LABEL: is_negnormal_f80: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; CHECK-64-NEXT: movswq %cx, %rdx ; CHECK-64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-64-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: testq %rcx, %rcx +; CHECK-64-NEXT: testq %rdx, %rdx ; CHECK-64-NEXT: sets %dl ; CHECK-64-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-64-NEXT: decl %ecx diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -78,22 +78,22 @@ ; CHECK-NOBMI-LABEL: is_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: paddq %xmm1, %xmm2 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm2 ; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm3, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2 -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm2 +; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; CHECK-NOBMI-NEXT: pand %xmm2, %xmm3 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 -; CHECK-NOBMI-NEXT: packssdw %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: packssdw %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-AVX2-LABEL: is_pow2_non_zero_4xv64: @@ -129,9 +129,12 @@ ; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 @@ -140,9 +143,6 @@ ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm3, %xmm4 ; CHECK-NOBMI-NEXT: pxor %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 @@ -189,40 +189,39 @@ ; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm4 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm1 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm5 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm1 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm1 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm5 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm5 +; CHECK-NOBMI-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm1[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; CHECK-NOBMI-NEXT: andps %xmm6, %xmm5 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm5 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm1 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm0 -; CHECK-NOBMI-NEXT: packssdw %xmm1, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NOBMI-NEXT: andps %xmm1, %xmm0 +; CHECK-NOBMI-NEXT: xorps %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: orps %xmm5, %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 +; CHECK-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; CHECK-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm4 ; CHECK-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; CHECK-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -229,11 +229,13 @@ ; CHECK-NEXT: jne .LBB12_8 ; CHECK-NEXT: # %bb.4: # %if.end29 ; CHECK-NEXT: movzwl (%eax), %eax -; CHECK-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD -; CHECK-NEXT: rorw %ax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: cmpl $6554, %eax # imm = 0x199A -; CHECK-NEXT: jae .LBB12_5 +; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD +; CHECK-NEXT: shrl $18, %ecx +; CHECK-NEXT: andl $-2, %ecx +; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: jne .LBB12_5 ; CHECK-NEXT: .LBB12_8: # %if.then44 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al @@ -389,11 +391,10 @@ ; CHECK-LABEL: func_test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl b, %eax -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movl a, %eax -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: testb %al, %cl ; CHECK-NEXT: je .LBB18_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: decl %eax diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -349,12 +349,26 @@ define <4 x i32> @knownbits_mask_srem_shuffle_lshr(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_srem_shuffle_lshr: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vpsrld $28, %xmm0, %xmm0 +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X86-NEXT: vpsrld $22, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_srem_shuffle_lshr: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vpsrld $28, %xmm0, %xmm0 +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X64-NEXT: vpsrld $22, %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = srem <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -157,8 +157,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_0: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -179,8 +180,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_1: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -203,10 +205,10 @@ ; ; X64-LABEL: signbits_ashr_shl_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vpsllq $20, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: sarq $61, %rax +; X64-NEXT: shll $20, %eax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shl <2 x i64> %1, @@ -220,9 +222,9 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrdl $30, %ecx, %eax ; X86-NEXT: sarl $30, %ecx -; X86-NEXT: shll $2, %eax ; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X86-NEXT: vpsrlq $3, %xmm0, %xmm0 @@ -235,9 +237,8 @@ ; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp: ; X64: # %bb.0: ; X64-NEXT: sarq $30, %rdi -; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vpsrlq $3, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: shrq $3, %rdi +; X64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr i64 %a0, 30 %2 = insertelement <2 x i64> undef, i64 %1, i32 0 @@ -352,7 +353,8 @@ ; X64: # %bb.0: ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vmovd %edi, %xmm1 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: vmovq %rax, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -405,24 +407,24 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovapd 8(%ebp), %xmm3 -; X86-NEXT: vpsrad $31, %xmm2, %xmm4 -; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X86-NEXT: vpsrad $1, %xmm5, %xmm5 -; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpmovsxdq 8(%ebp), %xmm4 +; X86-NEXT: vpmovsxdq 16(%ebp), %xmm3 ; X86-NEXT: vpsrad $31, %xmm2, %xmm5 +; X86-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; X86-NEXT: vpsrad $1, %xmm6, %xmm6 +; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] +; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpsrad $31, %xmm2, %xmm6 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpsrad $1, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 +; X86-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm5, %xmm0 -; X86-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; X86-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] diff --git a/llvm/test/CodeGen/X86/lea-recursion.ll b/llvm/test/CodeGen/X86/lea-recursion.ll --- a/llvm/test/CodeGen/X86/lea-recursion.ll +++ b/llvm/test/CodeGen/X86/lea-recursion.ll @@ -21,27 +21,27 @@ ; CHECK-NEXT: leal 1(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+4(%rip) ; CHECK-NEXT: movl g1+4(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx +; CHECK-NEXT: leal (%rdx,%rax), %ecx ; CHECK-NEXT: leal 2(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+8(%rip) ; CHECK-NEXT: movl g1+8(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 3(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+12(%rip) ; CHECK-NEXT: movl g1+12(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 4(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+16(%rip) ; CHECK-NEXT: movl g1+16(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 5(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+20(%rip) ; CHECK-NEXT: movl g1+20(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 6(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+24(%rip) ; CHECK-NEXT: movl g1+24(%rip), %eax -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal 7(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+28(%rip) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/legalize-shift.ll b/llvm/test/CodeGen/X86/legalize-shift.ll --- a/llvm/test/CodeGen/X86/legalize-shift.ll +++ b/llvm/test/CodeGen/X86/legalize-shift.ll @@ -5,13 +5,17 @@ define void @PR36250() nounwind { ; X86-LABEL: PR36250: ; X86: # %bb.0: -; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete (%eax) ; X86-NEXT: retl ; ; X64-LABEL: PR36250: ; X64: # %bb.0: -; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: movq (%rax), %rax +; X64-NEXT: leaq (%rax,%rax), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete (%rax) ; X64-NEXT: retq %1 = load i448, ptr undef diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll --- a/llvm/test/CodeGen/X86/lifetime-alias.ll +++ b/llvm/test/CodeGen/X86/lifetime-alias.ll @@ -28,10 +28,10 @@ ; CHECK: # %bb.0: # %_ZNSt3__312basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED2Ev.exit50 ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $5632, {{[0-9]+}}(%rsp) # imm = 0x1600 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -44,13 +44,11 @@ ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $21, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movups .L.str.1(%rip), %xmm1 ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax @@ -61,6 +59,8 @@ ; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/load-chain.ll b/llvm/test/CodeGen/X86/load-chain.ll --- a/llvm/test/CodeGen/X86/load-chain.ll +++ b/llvm/test/CodeGen/X86/load-chain.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl $-32707, %ebp # imm = 0x803D -; CHECK-NEXT: andl (%rdi), %ebp +; CHECK-NEXT: movzwl (%rdi), %ebp ; CHECK-NEXT: callq maybe_mutate@PLT +; CHECK-NEXT: andl $32829, %ebp # imm = 0x803D ; CHECK-NEXT: orl $514, %ebp # imm = 0x202 ; CHECK-NEXT: movw %bp, (%rbx) ; CHECK-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -894,7 +894,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index: @@ -939,13 +939,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 13(%eax,%ecx), %eax +; CHECK-NEXT: movl 13(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %esi, %eax -; CHECK64-NEXT: movl 13(%rax,%rdi), %eax +; CHECK64-NEXT: movl 13(%rdi,%rax), %eax ; CHECK64-NEXT: retq %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 @@ -995,7 +995,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zaext_loads: @@ -1051,7 +1051,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zsext_loads: diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll --- a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -19,10 +19,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: # implicit-def: $xmm0 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %else @@ -56,10 +56,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %else ; CHECK-NEXT: testb $2, %cl diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -5,28 +5,35 @@ define void @_start() nounwind { ; FAST-SHLD-LABEL: _start: ; FAST-SHLD: # %bb.0: # %Entry -; FAST-SHLD-NEXT: movq -40(%rsp), %rax -; FAST-SHLD-NEXT: movq -32(%rsp), %rcx -; FAST-SHLD-NEXT: movq %rcx, %rdx -; FAST-SHLD-NEXT: shlq $62, %rdx -; FAST-SHLD-NEXT: shrq $2, %rcx -; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx -; FAST-SHLD-NEXT: andq $-4, %rax -; FAST-SHLD-NEXT: orq $1, %rax -; FAST-SHLD-NEXT: movq %rax, -40(%rsp) -; FAST-SHLD-NEXT: movq %rcx, -32(%rsp) -; FAST-SHLD-NEXT: orq $-2, -56(%rsp) +; FAST-SHLD-NEXT: movl -24(%rsp), %eax +; FAST-SHLD-NEXT: movl %eax, %ecx +; FAST-SHLD-NEXT: shrl $2, %ecx +; FAST-SHLD-NEXT: movq -40(%rsp), %rdx +; FAST-SHLD-NEXT: movq -32(%rsp), %rsi +; FAST-SHLD-NEXT: shldq $62, %rsi, %rax +; FAST-SHLD-NEXT: shrdq $2, %rsi, %rdx +; FAST-SHLD-NEXT: leaq 1(,%rdx,4), %rsi +; FAST-SHLD-NEXT: movq %rsi, -40(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rax, %rdx +; FAST-SHLD-NEXT: movq %rdx, -32(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rcx, %rax +; FAST-SHLD-NEXT: andl $7, %eax +; FAST-SHLD-NEXT: movb %al, -24(%rsp) ; FAST-SHLD-NEXT: movq $-1, -48(%rsp) +; FAST-SHLD-NEXT: orq $-2, -56(%rsp) ; FAST-SHLD-NEXT: retq ; ; SLOW-SHLD-LABEL: _start: ; SLOW-SHLD: # %bb.0: # %Entry ; SLOW-SHLD-NEXT: movq -40(%rsp), %rax +; SLOW-SHLD-NEXT: movzbl -24(%rsp), %ecx +; SLOW-SHLD-NEXT: andl $7, %ecx +; SLOW-SHLD-NEXT: movb %cl, -24(%rsp) ; SLOW-SHLD-NEXT: andq $-4, %rax ; SLOW-SHLD-NEXT: orq $1, %rax ; SLOW-SHLD-NEXT: movq %rax, -40(%rsp) -; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: movq $-1, -48(%rsp) +; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: retq Entry: %y = alloca <3 x i129>, align 16 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -104,14 +104,32 @@ } define <4 x float> @load_float4_float3_as_float2_float(ptr nocapture readonly dereferenceable(16)) nofree nosync { -; SSE-LABEL: load_float4_float3_as_float2_float: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_as_float2_float: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_as_float2_float: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_as_float2_float: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = load <2 x float>, ptr %0, align 4 %3 = extractelement <2 x float> %2, i32 0 @@ -380,40 +398,48 @@ } define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) { -; SSE-LABEL: PR43227: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, 672(%rsi) -; SSE-NEXT: movdqa %xmm0, 688(%rsi) -; SSE-NEXT: retq +; SSE2-LABEL: PR43227: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, 672(%rsi) +; SSE2-NEXT: movdqa %xmm0, 688(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR43227: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: psrlq $32, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, 672(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 688(%rsi) +; SSSE3-NEXT: retq ; -; AVX1-LABEL: PR43227: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; SSE41-LABEL: PR43227: +; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, 672(%rsi) +; SSE41-NEXT: movaps %xmm1, 688(%rsi) +; SSE41-NEXT: retq ; -; AVX2-LABEL: PR43227: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: PR43227: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> @@ -423,3 +449,6 @@ store <8 x i32> %5, ptr %6, align 32 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} +; AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -365,7 +365,7 @@ ; SSE-NEXT: movzwl %cx, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: shrl $15, %eax -; SSE-NEXT: sarl $5, %ecx +; SSE-NEXT: shrl $5, %ecx ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: movd %ecx, %xmm0 ; SSE-NEXT: retq @@ -379,7 +379,7 @@ ; AVX-NEXT: movzwl %cx, %eax ; AVX-NEXT: movswl %ax, %ecx ; AVX-NEXT: shrl $15, %eax -; AVX-NEXT: sarl $5, %ecx +; AVX-NEXT: shrl $5, %ecx ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vmovd %ecx, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -40,9 +40,9 @@ ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB0_1: # %vector.body ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm2 +; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: addq $8, %rcx ; AVX-NEXT: cmpq %rcx, %rax @@ -96,7 +96,16 @@ ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 ; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -118,8 +127,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB1_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -144,17 +158,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB1_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB1_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -203,14 +220,32 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pmulhw %xmm4, %xmm7 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pmulhw %xmm5, %xmm7 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB2_1 @@ -234,14 +269,24 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB2_1 @@ -268,8 +313,15 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm2 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -277,9 +329,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -294,8 +346,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB2_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1 -; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm1 +; AVX512-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -304,9 +360,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -350,7 +406,6 @@ ; SSE2-LABEL: _Z10test_shortPsS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -359,26 +414,63 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 -; SSE2-NEXT: pmaddwd %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm7, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm8, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm7 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm10 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm11 +; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm12 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pmulhw %xmm7, %xmm13 +; SSE2-NEXT: pmullw %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: pmulhw %xmm10, %xmm13 +; SSE2-NEXT: pmullw %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm8, %xmm13 +; SSE2-NEXT: pmulhw %xmm11, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: pmulhw %xmm12, %xmm13 +; SSE2-NEXT: pmullw %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm11[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm8 +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 @@ -403,22 +495,42 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 -; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 -; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 -; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB3_1 @@ -451,11 +563,25 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB3_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 -; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %ymm5 +; AVX2-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm7 +; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -465,9 +591,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -483,10 +609,16 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB3_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3 -; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3 -; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm2 +; AVX512F-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm3 +; AVX512F-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm4 +; AVX512F-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm5 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm5 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $16, %rcx @@ -497,9 +629,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -515,8 +647,17 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB3_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2 -; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm2 +; AVX512BW-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm3 +; AVX512BW-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm4 +; AVX512BW-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm5 +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm5, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $16, %rcx ; AVX512BW-NEXT: cmpq %rcx, %rax @@ -526,9 +667,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -668,7 +809,15 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -690,9 +839,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $16, %rcx @@ -717,18 +870,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB5_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX256-NEXT: vpmovsxbd (%rdi,%rcx), %ymm1 +; AVX256-NEXT: vpmovsxbd (%rsi,%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $16, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB5_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -783,14 +938,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: pmaddwd %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -814,11 +985,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -850,8 +1029,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB6_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx @@ -860,9 +1045,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -877,9 +1062,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB6_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rcx), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rcx), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -888,9 +1076,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -934,7 +1122,6 @@ ; SSE2-LABEL: _Z9test_charPcS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -944,37 +1131,70 @@ ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSE2-NEXT: psraw $8, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE2-NEXT: psraw $8, %xmm10 -; SSE2-NEXT: pmaddwd %xmm9, %xmm10 -; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm5, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE2-NEXT: psraw $8, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm9 +; SSE2-NEXT: pmullw %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm7, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm8[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm11 +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm8[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 @@ -999,17 +1219,33 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 -; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 28(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd 24(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 20(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 16(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxbd 28(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 24(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 20(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 16(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 @@ -1051,14 +1287,26 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB7_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm4 +; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5 +; AVX2-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6 +; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm6 +; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm7 +; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $32, %rcx ; AVX2-NEXT: cmpq %rcx, %rax ; AVX2-NEXT: jne .LBB7_1 @@ -1067,9 +1315,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1101,9 +1349,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -1119,8 +1367,12 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB7_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; AVX512BW-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; AVX512BW-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $32, %rcx @@ -1131,9 +1383,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1335,9 +1587,9 @@ ; AVX256-NEXT: jne .LBB9_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -1490,9 +1742,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1518,9 +1770,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1743,9 +1995,9 @@ ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1777,9 +2029,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1824,13 +2076,39 @@ define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: pmaddwd_8: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_8: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1843,13 +2121,39 @@ define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: pmaddwd_8_swapped: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_8_swapped: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_8_swapped: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8_swapped: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1877,13 +2181,24 @@ ; ; AVX1-LABEL: larger_mul: ; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: larger_mul: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1892,8 +2207,10 @@ ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> @@ -1908,8 +2225,26 @@ define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: pmaddwd_16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_16: @@ -1921,10 +2256,20 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_16: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> %b = sext <16 x i16> %B to <16 x i32> %m = mul nsw <16 x i32> %a, %b @@ -1937,10 +2282,46 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: pmaddwd_32: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm6[0,2] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_32: @@ -1988,13 +2369,36 @@ define <4 x i32> @pmaddwd_const(<8 x i16> %A) { ; SSE2-LABEL: pmaddwd_const: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_const: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_const: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX256-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %m = mul nsw <8 x i32> %a, %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> @@ -2058,9 +2462,9 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -2095,13 +2499,41 @@ define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: jumbled_indices4: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: jumbled_indices4: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: jumbled_indices4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1-NEXT: retq +; +; AVX256-LABEL: jumbled_indices4: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %exta = sext <8 x i16> %A to <8 x i32> %extb = sext <8 x i16> %B to <8 x i32> %m = mul <8 x i32> %exta, %extb @@ -2114,8 +2546,26 @@ define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: jumbled_indices8: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm3[2,1] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[2,1] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices8: @@ -2127,10 +2577,21 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: jumbled_indices8: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: jumbled_indices8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: jumbled_indices8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,5,4,3,2,7,6] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %exta = sext <16 x i16> %A to <16 x i32> %extb = sext <16 x i16> %B to <16 x i32> %m = mul <16 x i32> %exta, %extb @@ -2143,10 +2604,46 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: jumbled_indices16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm7[3,1] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm6[0,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm5[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm7[2,0] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[1,2] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[0,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices16: @@ -2194,16 +2691,99 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { ; SSE2-LABEL: jumbled_indices32: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm15 +; SSE2-NEXT: pmulhw %xmm7, %xmm15 +; SSE2-NEXT: pmullw %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: pmulhw %xmm9, %xmm15 +; SSE2-NEXT: pmullw %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: pmulhw %xmm10, %xmm15 +; SSE2-NEXT: pmullw %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm3, %xmm15 +; SSE2-NEXT: pmulhw %xmm11, %xmm15 +; SSE2-NEXT: pmullw %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm4, %xmm15 +; SSE2-NEXT: pmulhw %xmm13, %xmm15 +; SSE2-NEXT: pmullw %xmm13, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm5, %xmm15 +; SSE2-NEXT: pmulhw %xmm14, %xmm15 +; SSE2-NEXT: pmullw %xmm14, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: pmulhw %xmm12, %xmm15 +; SSE2-NEXT: pmullw %xmm12, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pmulhw %xmm15, %xmm7 +; SSE2-NEXT: pmullw %xmm15, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm15[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm15[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm12[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm14[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm13[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm13[0,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm11[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm9[2,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[3,0] +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm8, 112(%rdi) ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) ; SSE2-NEXT: movdqa %xmm5, 80(%rdi) ; SSE2-NEXT: movdqa %xmm4, 64(%rdi) @@ -2306,10 +2886,44 @@ define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_256: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pmulhw %xmm4, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_256: @@ -2321,11 +2935,52 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_256: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa (%rdi), %ymm0 -; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmulld %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX512-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %A = load <16 x i16>, <16 x i16>* %Aptr %B = load <16 x i16>, <16 x i16>* %Bptr %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> @@ -2345,14 +3000,82 @@ define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_512: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rsi), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm9 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm7 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: packssdw %xmm9, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm6 +; SSE2-NEXT: pmulhw %xmm9, %xmm6 +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pmulhw %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pmulhw %xmm0, %xmm6 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_512: @@ -2377,20 +3100,24 @@ ; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: pmaddwd_512: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: pmaddwd_512: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: pmaddwd_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %zmm2, %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512-NEXT: vpmulld %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %A = load <32 x i16>, <32 x i16>* %Aptr %B = load <32 x i16>, <32 x i16>* %Bptr %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> @@ -2411,30 +3138,176 @@ ; SSE2-LABEL: pmaddwd_1024: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pmaddwd (%rdx), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 -; SSE2-NEXT: movdqa 80(%rsi), %xmm5 -; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 -; SSE2-NEXT: movdqa 96(%rsi), %xmm6 -; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 +; SSE2-NEXT: movdqa 96(%rsi), %xmm9 ; SSE2-NEXT: movdqa 112(%rsi), %xmm7 -; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) -; SSE2-NEXT: movdqa %xmm6, 96(%rdi) -; SSE2-NEXT: movdqa %xmm5, 80(%rdi) -; SSE2-NEXT: movdqa %xmm4, 64(%rdi) -; SSE2-NEXT: movdqa %xmm3, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa 64(%rsi), %xmm12 +; SSE2-NEXT: movdqa 80(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rsi), %xmm10 +; SSE2-NEXT: movdqa 48(%rsi), %xmm8 +; SSE2-NEXT: movdqa 80(%rdx), %xmm11 +; SSE2-NEXT: movdqa (%rdx), %xmm5 +; SSE2-NEXT: movdqa 16(%rdx), %xmm14 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm3[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE2-NEXT: movdqa 32(%rdx), %xmm4 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: packssdw %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa 64(%rdx), %xmm8 +; SSE2-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE2-NEXT: movdqa 112(%rdx), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa 96(%rdx), %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: packssdw %xmm14, %xmm5 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm3, %xmm4 +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: packssdw %xmm11, %xmm8 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: pmullw %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pmulhw %xmm15, %xmm1 +; SSE2-NEXT: pmullw %xmm13, %xmm15 +; SSE2-NEXT: movdqa %xmm15, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhw %xmm12, %xmm1 +; SSE2-NEXT: pmullw %xmm2, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhw %xmm7, %xmm1 +; SSE2-NEXT: pmullw %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pmulhw %xmm10, %xmm1 +; SSE2-NEXT: pmullw %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: paddd %xmm14, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pmulhw %xmm8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: paddd %xmm15, %xmm8 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pmulhw %xmm4, %xmm1 +; SSE2-NEXT: pmullw %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: paddd %xmm12, %xmm4 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pmulhw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: paddd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: movdqa %xmm10, 96(%rdi) +; SSE2-NEXT: movdqa %xmm0, 80(%rdi) +; SSE2-NEXT: movdqa %xmm8, 64(%rdi) +; SSE2-NEXT: movdqa %xmm6, 48(%rdi) +; SSE2-NEXT: movdqa %xmm4, 32(%rdi) +; SSE2-NEXT: movdqa %xmm9, 16(%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_1024: @@ -2514,13 +3387,26 @@ ; SSE2-LABEL: pmaddwd_commuted_mul: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_commuted_mul: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rsi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* %Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2541,14 +3427,20 @@ define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_swapped_indices: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_swapped_indices: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* %Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2604,31 +3496,80 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: movdqu (%rdx), %xmm0 -; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqu (%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rcx), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_double_reduction: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_double_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_double_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2655,49 +3596,140 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: movdqu (%rdx), %xmm0 ; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqu (%r8), %xmm0 -; SSE2-NEXT: movdqu (%r9), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqu (%r8), %xmm1 +; SSE2-NEXT: movdqu (%r9), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqu (%r10), %xmm0 ; SSE2-NEXT: movdqu (%rax), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_quad_reduction: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqu (%r8), %xmm1 -; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%r10), %xmm1 -; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_quad_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r8), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r8), %xmm2 +; AVX1-NEXT: vpmovsxwd (%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r10), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r10), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_quad_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r8), %ymm1 +; AVX256-NEXT: vpmovsxwd (%r9), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r10), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rax), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2753,8 +3785,15 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm4 +; SSE2-NEXT: pmullw %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm5[0,2] +; SSE2-NEXT: psrld $16, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: addq $-8, %rax ; SSE2-NEXT: jne .LBB33_1 @@ -2835,16 +3874,16 @@ ; AVX256-NEXT: jne .LBB33_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vmovd %xmm1, %ecx ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -2908,7 +3947,16 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pmaddwd %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pmulhw %xmm3, %xmm2 +; SSE2-NEXT: pmullw %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -2930,10 +3978,15 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB34_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -2958,19 +4011,21 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB34_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB34_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -3114,14 +4169,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, %rax ; SSE2-NEXT: cmpq %r8, %rax ; SSE2-NEXT: jb .LBB38_1 @@ -3146,11 +4217,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB38_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rax), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rax), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -3183,8 +4262,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB38_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rax), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rax), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rax), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rax), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rax @@ -3193,9 +4278,9 @@ ; AVX2-NEXT: # %bb.2: # %afterloop ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -3211,9 +4296,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB38_1: # %loop ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rax), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rax), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rax), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rax), %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rax ; AVX512-NEXT: cmpq %r8, %rax @@ -3222,9 +4310,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/mask-negated-bool.ll b/llvm/test/CodeGen/X86/mask-negated-bool.ll --- a/llvm/test/CodeGen/X86/mask-negated-bool.ll +++ b/llvm/test/CodeGen/X86/mask-negated-bool.ll @@ -27,7 +27,10 @@ define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_zext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = zext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext @@ -61,7 +64,10 @@ define <4 x i32> @mask_negated_sext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_sext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1290,7 +1290,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB6_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -1614,7 +1614,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB6_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -1895,7 +1895,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB6_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1389,7 +1389,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %ecx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: orl %edx, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $1, %dl ; SSE2-NEXT: jne LBB8_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -1746,7 +1746,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %ecx ; SSE42-NEXT: shll $16, %ecx ; SSE42-NEXT: orl %edx, %ecx -; SSE42-NEXT: testb $1, %cl +; SSE42-NEXT: testb $1, %dl ; SSE42-NEXT: jne LBB8_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %cl @@ -2043,7 +2043,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB8_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -2666,20 +2666,16 @@ ; define <2 x i64> @expandload_v2i64_const(ptr %base, <2 x i64> %src0) { -; SSE2-LABEL: expandload_v2i64_const: -; SSE2: ## %bb.0: -; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE42-LABEL: expandload_v2i64_const: -; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrq $1, (%rdi), %xmm0 -; SSE42-NEXT: retq +; SSE-LABEL: expandload_v2i64_const: +; SSE: ## %bb.0: +; SSE-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v2i64_const: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovddup (%rdi), %xmm1 ## xmm1 = mem[0,0] +; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: expandload_v2i64_const: diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -139,8 +139,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB0_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -338,8 +339,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB1_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -533,8 +535,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB2_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -867,8 +867,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $xmm0 ; KNL_64-NEXT: je .LBB14_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -908,8 +909,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: # implicit-def: $xmm0 ; KNL_32-NEXT: jne .LBB14_1 ; KNL_32-NEXT: # %bb.2: # %else @@ -981,8 +983,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB15_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -1025,8 +1028,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB15_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1098,8 +1102,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB16_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1127,8 +1132,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB16_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1157,8 +1163,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB16_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1183,8 +1190,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB16_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1220,8 +1228,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB17_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1257,8 +1266,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB17_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1320,8 +1330,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2 ; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB18_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1363,8 +1374,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: je .LBB18_2 ; KNL_32-NEXT: # %bb.1: # %cond.store ; KNL_32-NEXT: vmovd %xmm1, %ecx @@ -1423,8 +1435,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB19_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1447,8 +1460,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB19_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1471,8 +1485,9 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB19_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1493,8 +1508,9 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB19_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1520,8 +1536,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB20_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1544,8 +1561,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB20_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1568,8 +1586,9 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB20_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1590,8 +1609,9 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB20_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1624,8 +1644,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB21_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1654,8 +1675,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB21_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1685,8 +1707,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB21_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1712,8 +1735,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB21_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1747,8 +1771,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB22_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1778,8 +1803,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB22_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1808,8 +1834,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB22_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1836,8 +1863,9 @@ ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB22_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1874,8 +1902,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB23_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1903,8 +1932,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB23_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1933,8 +1963,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB23_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1959,8 +1990,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB23_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1993,8 +2025,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB24_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2023,8 +2056,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB24_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -2052,8 +2086,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB24_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -2079,8 +2114,9 @@ ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB24_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -2165,8 +2201,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB26_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2194,8 +2231,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB26_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -2226,8 +2264,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB26_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -2252,8 +2291,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB26_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -3251,7 +3291,8 @@ ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm1 ; KNL_64-NEXT: je .LBB42_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -3281,7 +3322,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; KNL_64-NEXT: .LBB42_8: # %else8 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm3 ; KNL_64-NEXT: jne .LBB42_9 ; KNL_64-NEXT: # %bb.10: # %else15 @@ -3299,7 +3341,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; KNL_64-NEXT: .LBB42_16: # %else33 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm4 ; KNL_64-NEXT: jne .LBB42_17 ; KNL_64-NEXT: # %bb.18: # %else40 @@ -3364,16 +3407,19 @@ ; KNL_32-NEXT: movl %esp, %ebp ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: pushl %ebx +; KNL_32-NEXT: pushl %edi ; KNL_32-NEXT: pushl %esi ; KNL_32-NEXT: andl $-32, %esp ; KNL_32-NEXT: subl $32, %esp -; KNL_32-NEXT: .cfi_offset %esi, -16 +; KNL_32-NEXT: .cfi_offset %esi, -20 +; KNL_32-NEXT: .cfi_offset %edi, -16 ; KNL_32-NEXT: .cfi_offset %ebx, -12 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl -; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: vmovd %xmm0, %edi ; KNL_32-NEXT: # implicit-def: $ymm1 ; KNL_32-NEXT: je .LBB42_2 ; KNL_32-NEXT: # %bb.1: # %cond.load @@ -3406,7 +3452,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; KNL_32-NEXT: .LBB42_8: # %else8 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # implicit-def: $ymm0 ; KNL_32-NEXT: jne .LBB42_9 ; KNL_32-NEXT: # %bb.10: # %else15 @@ -3425,7 +3472,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; KNL_32-NEXT: .LBB42_16: # %else33 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # implicit-def: $ymm2 ; KNL_32-NEXT: jne .LBB42_17 ; KNL_32-NEXT: # %bb.18: # %else40 @@ -3445,8 +3493,9 @@ ; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; KNL_32-NEXT: leal -8(%ebp), %esp +; KNL_32-NEXT: leal -12(%ebp), %esp ; KNL_32-NEXT: popl %esi +; KNL_32-NEXT: popl %edi ; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 @@ -3705,8 +3754,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB47_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -3737,8 +3787,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB47_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -3770,8 +3821,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2 ; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB47_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -3799,8 +3851,9 @@ ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB47_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4016,8 +4069,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB52_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4043,8 +4097,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB52_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4071,8 +4126,9 @@ ; SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm1, %xmm1 ; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB52_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -4095,8 +4151,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 ; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB52_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4396,9 +4453,10 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB58_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4423,9 +4481,10 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB58_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4451,9 +4510,10 @@ ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; SKX_SMALL-NEXT: kmovw %k0, %ecx ; SKX_SMALL-NEXT: kmovw %k0, %eax ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_SMALL-NEXT: testb $1, %al +; SKX_SMALL-NEXT: testb $1, %cl ; SKX_SMALL-NEXT: jne .LBB58_1 ; SKX_SMALL-NEXT: # %bb.2: # %else ; SKX_SMALL-NEXT: testb $2, %al @@ -4477,9 +4537,10 @@ ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1 +; SKX_LARGE-NEXT: kmovw %k0, %ecx ; SKX_LARGE-NEXT: kmovw %k0, %eax ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_LARGE-NEXT: testb $1, %al +; SKX_LARGE-NEXT: testb $1, %cl ; SKX_LARGE-NEXT: jne .LBB58_1 ; SKX_LARGE-NEXT: # %bb.2: # %else ; SKX_LARGE-NEXT: testb $2, %al @@ -4502,9 +4563,10 @@ ; SKX_32-NEXT: vpmovq2m %xmm0, %k0 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB58_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4592,8 +4654,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB60_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4618,8 +4681,9 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB60_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4644,8 +4708,9 @@ ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX_SMALL-NEXT: kmovw %k0, %ecx ; SKX_SMALL-NEXT: kmovw %k0, %eax -; SKX_SMALL-NEXT: testb $1, %al +; SKX_SMALL-NEXT: testb $1, %cl ; SKX_SMALL-NEXT: jne .LBB60_1 ; SKX_SMALL-NEXT: # %bb.2: # %else ; SKX_SMALL-NEXT: testb $2, %al @@ -4669,8 +4734,9 @@ ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm0 +; SKX_LARGE-NEXT: kmovw %k0, %ecx ; SKX_LARGE-NEXT: kmovw %k0, %eax -; SKX_LARGE-NEXT: testb $1, %al +; SKX_LARGE-NEXT: testb $1, %cl ; SKX_LARGE-NEXT: jne .LBB60_1 ; SKX_LARGE-NEXT: # %bb.2: # %else ; SKX_LARGE-NEXT: testb $2, %al @@ -4693,8 +4759,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm0, %k0 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB60_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4780,8 +4847,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB62_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4820,8 +4888,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB62_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4890,8 +4959,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB63_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4928,8 +4998,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB63_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -12,8 +12,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB0_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -41,8 +42,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB0_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -83,8 +85,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpsllq $3, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB1_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -110,8 +113,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB1_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -165,8 +169,9 @@ ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB2_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -189,8 +194,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB2_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -227,8 +233,9 @@ ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB3_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -249,8 +256,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB3_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -302,8 +310,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB4_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -331,8 +340,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB4_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -374,8 +384,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpsllq $2, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB5_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -401,8 +412,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB5_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -4521,7 +4521,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB24_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -4940,7 +4940,7 @@ ; SSE42-NEXT: pmovmskb %xmm1, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB24_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -5174,7 +5174,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB24_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -6563,20 +6563,13 @@ } define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) { -; SSE2-LABEL: mload_constmask_v8f32_zero: -; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: mload_constmask_v8f32_zero: -; SSE42: ## %bb.0: -; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero -; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: mload_constmask_v8f32_zero: +; SSE: ## %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f32_zero: ; AVX1OR2: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -3250,7 +3250,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: jne LBB16_1 ; SSE2-NEXT: ## %bb.2: ## %else @@ -3458,7 +3458,7 @@ ; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne LBB16_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %al @@ -3693,7 +3693,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB16_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -5241,10 +5241,10 @@ ; SSE2-NEXT: andb $1, %dl ; SSE2-NEXT: addb %dl, %dl ; SSE2-NEXT: orb %sil, %dl -; SSE2-NEXT: andb $1, %cl ; SSE2-NEXT: shlb $2, %cl ; SSE2-NEXT: orb %dl, %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: andb $7, %cl +; SSE2-NEXT: testb %sil, %sil ; SSE2-NEXT: jne LBB28_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -5274,10 +5274,10 @@ ; SSE4-NEXT: andb $1, %dl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: orb %sil, %dl -; SSE4-NEXT: andb $1, %cl ; SSE4-NEXT: shlb $2, %cl ; SSE4-NEXT: orb %dl, %cl -; SSE4-NEXT: testb $1, %cl +; SSE4-NEXT: andb $7, %cl +; SSE4-NEXT: testb %sil, %sil ; SSE4-NEXT: jne LBB28_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %cl @@ -5623,37 +5623,38 @@ ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd (%rdi), %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = mem[0,2,2,3] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE2-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] ; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,0,2,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmovmskb %xmm9, %r11d +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE2-NEXT: pmovmskb %xmm10, %r11d ; SSE2-NEXT: andl $21845, %r11d ## imm = 0x5555 ; SSE2-NEXT: pmovmskb %xmm7, %edi ; SSE2-NEXT: andl $85, %edi ; SSE2-NEXT: shll $16, %edi ; SSE2-NEXT: orl %r11d, %edi -; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movd %xmm6, %r11d +; SSE2-NEXT: testb $1, %r11b ; SSE2-NEXT: jne LBB31_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %dil @@ -5898,24 +5899,23 @@ ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 56(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 52(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm2 +; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm2 ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm2 ; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm1 +; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; SSE4-NEXT: packusdw %xmm2, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpgtd (%rdi), %xmm3 +; SSE4-NEXT: movd %xmm3, %eax ; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm3 ; SSE4-NEXT: packusdw %xmm2, %xmm3 +; SSE4-NEXT: packusdw %xmm1, %xmm3 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpgtd 80(%rdi), %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] @@ -5924,14 +5924,16 @@ ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; SSE4-NEXT: packusdw %xmm1, %xmm2 ; SSE4-NEXT: packusdw %xmm2, %xmm2 -; SSE4-NEXT: pmovmskb %xmm3, %eax -; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 +; SSE4-NEXT: pmovmskb %xmm3, %ecx +; SSE4-NEXT: andl $21845, %ecx ## imm = 0x5555 ; SSE4-NEXT: pmovmskb %xmm2, %edi ; SSE4-NEXT: andl $85, %edi ; SSE4-NEXT: shll $16, %edi -; SSE4-NEXT: orl %eax, %edi +; SSE4-NEXT: orl %ecx, %edi +; SSE4-NEXT: movl 52(%rsi), %ecx +; SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: movl 48(%rsi), %r13d -; SSE4-NEXT: testb $1, %dil ; SSE4-NEXT: movl 44(%rsi), %eax ; SSE4-NEXT: movl 40(%rsi), %ecx ; SSE4-NEXT: movl 36(%rsi), %r8d @@ -6176,26 +6178,24 @@ ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm5, 32(%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpacksswb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,u],zero,xmm3[u,6,u],zero,xmm3[u,12,u],zero,xmm3[u,14,u],zero,xmm3[u] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpslld $31, %ymm4, %ymm4 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm4, 32(%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,u],zero,xmm3[u,2,u],zero,xmm3[u,8,u],zero,xmm3[u,10,u],zero,xmm3[u] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) @@ -6442,7 +6442,7 @@ ; ; AVX2-LABEL: undefshuffle: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1307,8 +1307,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1538,8 +1539,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1777,8 +1779,9 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1896,8 +1899,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4191,8 +4195,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4365,8 +4370,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4435,7 +4441,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4652,7 +4658,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -4897,7 +4903,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -16,103 +16,108 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm0, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: pxor %xmm8, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm11, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm11, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -121,8 +126,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm10 +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -131,8 +136,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -140,7 +145,7 @@ ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: por %xmm8, %xmm6 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -347,29 +352,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -390,110 +396,115 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm3, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm10 ; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 -; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 @@ -909,19 +920,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -942,111 +953,116 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm3, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm10 ; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 -; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 @@ -1458,19 +1474,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1491,25 +1507,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1518,32 +1535,33 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 @@ -1687,9 +1705,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1722,25 +1739,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1749,33 +1767,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 @@ -1968,8 +1987,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2015,9 +2035,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2038,25 +2057,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm0 @@ -2065,37 +2085,38 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm6 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax @@ -2294,8 +2315,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2341,9 +2363,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2363,34 +2384,36 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 @@ -2490,9 +2513,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2523,35 +2545,37 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 @@ -2668,8 +2692,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2701,9 +2726,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -2722,25 +2746,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2749,9 +2775,9 @@ ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al @@ -2865,8 +2891,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2898,9 +2925,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -3580,18 +3606,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4251,18 +4275,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4640,9 +4662,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5019,9 +5040,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5152,8 +5172,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5198,9 +5219,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5392,8 +5412,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5439,9 +5460,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5466,7 +5486,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5678,7 +5698,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5920,7 +5940,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6621,19 +6641,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7259,9 +7278,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7556,9 +7574,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp slt <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -11,51 +11,51 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm7, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm12, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -64,8 +64,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -74,8 +74,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -83,7 +83,7 @@ ; SSE2-NEXT: movd %xmm4, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -280,26 +280,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -318,49 +322,49 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pand %xmm11, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm2 ; SSE2-NEXT: pandn %xmm8, %xmm7 ; SSE2-NEXT: por %xmm2, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm12, %xmm2 +; SSE2-NEXT: pand %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm8, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm11, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] @@ -778,17 +782,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -807,51 +813,51 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm1 ; SSE2-NEXT: pandn %xmm7, %xmm10 ; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: packuswb %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm7, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: packuswb %xmm0, %xmm8 +; SSE2-NEXT: packuswb %xmm8, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 @@ -1256,17 +1262,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1285,22 +1293,22 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm1 @@ -1448,8 +1456,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1480,27 +1488,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -1696,8 +1704,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1743,8 +1752,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -1763,29 +1772,29 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: packuswb %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm6, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1985,8 +1994,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2032,8 +2042,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2050,11 +2060,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2160,8 +2171,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2189,11 +2200,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2317,8 +2329,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2350,8 +2363,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -2367,11 +2380,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2493,8 +2507,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2526,8 +2541,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -2541,22 +2556,22 @@ ; SSE2-LABEL: truncstore_v16i32_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm11, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: pxor %xmm11, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm8 +; SSE2-NEXT: pandn %xmm10, %xmm8 ; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm13 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm13 +; SSE2-NEXT: pandn %xmm10, %xmm13 ; SSE2-NEXT: por %xmm1, %xmm13 ; SSE2-NEXT: pslld $16, %xmm13 ; SSE2-NEXT: psrad $16, %xmm13 @@ -2564,14 +2579,15 @@ ; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: packssdw %xmm13, %xmm8 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm7 -; SSE2-NEXT: pxor %xmm9, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -2596,9 +2612,9 @@ ; SSE2-NEXT: pextrw $2, %xmm8, %ecx ; SSE2-NEXT: movw %cx, 4(%rdi) ; SSE2-NEXT: .LBB9_6: # %else4 -; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm9 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB9_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -2606,9 +2622,9 @@ ; SSE2-NEXT: movw %cx, 6(%rdi) ; SSE2-NEXT: .LBB9_8: # %else6 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm9 ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB9_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 @@ -3237,16 +3253,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -3946,16 +3962,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -3970,30 +3986,31 @@ ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm8 -; SSE2-NEXT: pslld $16, %xmm8 -; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: packssdw %xmm8, %xmm4 +; SSE2-NEXT: packssdw %xmm9, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax @@ -4357,8 +4374,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4754,8 +4771,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4934,8 +4951,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4981,8 +4999,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5163,8 +5181,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5210,8 +5229,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5241,7 +5260,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5462,7 +5481,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5709,7 +5728,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6419,17 +6438,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7068,8 +7088,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7369,8 +7389,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp ult <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -180,24 +180,22 @@ ; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero ; SSE-NEXT: addss %xmm13, %xmm1 ; SSE-NEXT: addss %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: mulss %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: mulss %xmm11, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,0,0,0] ; SSE-NEXT: mulps %xmm0, %xmm11 -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: mulss %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: mulss %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0,0,0] ; SSE-NEXT: mulps %xmm3, %xmm10 ; SSE-NEXT: addps %xmm11, %xmm10 ; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: mulss %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0,0,0] ; SSE-NEXT: mulps %xmm6, %xmm8 ; SSE-NEXT: addps %xmm10, %xmm8 -; SSE-NEXT: addss %xmm7, %xmm12 -; SSE-NEXT: addss %xmm11, %xmm12 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: addss %xmm12, %xmm7 +; SSE-NEXT: addss %xmm11, %xmm7 ; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero ; SSE-NEXT: mulss %xmm10, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] @@ -212,11 +210,12 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: mulps %xmm6, %xmm3 ; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1] ; SSE-NEXT: addss %xmm2, %xmm5 ; SSE-NEXT: addss %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,1] ; SSE-NEXT: movss %xmm5, 32(%rdi) ; SSE-NEXT: movaps %xmm7, 16(%rdi) ; SSE-NEXT: movaps %xmm4, (%rdi) @@ -256,7 +255,6 @@ ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 ; AVX1-NEXT: vmulps %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 @@ -270,15 +268,13 @@ ; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-NEXT: vmovss %xmm2, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovaps %xmm1, (%rdi) +; AVX1-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f32: @@ -315,36 +311,35 @@ ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] -; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 -; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 -; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm3 +; AVX2-NEXT: vmulps %xmm0, %xmm9, %xmm0 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX2-NEXT: vmulps %xmm3, %xmm10, %xmm3 ; AVX2-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 ; AVX2-NEXT: vmulps %xmm3, %xmm6, %xmm6 ; AVX2-NEXT: vaddps %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vmulss %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX2-NEXT: vmulss %xmm5, %xmm10, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,1,2,4,5,6,u,u> -; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] ; AVX2-NEXT: vmovss %xmm2, 32(%rdi) -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovaps %xmm1, (%rdi) +; AVX2-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm0, %xmm3 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[2,3] ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 ; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 @@ -356,105 +351,108 @@ ; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] -; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 -; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 -; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3] +; AVX512F-NEXT: vmulss %xmm6, %xmm11, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm10, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vmulss %xmm6, %xmm8, %xmm8 +; AVX512F-NEXT: vaddss %xmm5, %xmm8, %xmm5 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3] ; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 ; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] -; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 -; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] -; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 +; AVX512F-NEXT: vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2] +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm10 +; AVX512F-NEXT: vaddps %xmm10, %xmm8, %xmm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm3, %xmm10, %xmm12 ; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 ; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7 -; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 +; AVX512F-NEXT: vmulss %xmm9, %xmm11, %xmm12 ; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 -; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 -; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] -; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] -; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vmulss %xmm6, %xmm10, %xmm10 +; AVX512F-NEXT: vaddss %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm12 = xmm9[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 -; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 ; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 +; AVX512F-NEXT: vmulss %xmm4, %xmm12, %xmm2 +; AVX512F-NEXT: vmulss %xmm10, %xmm11, %xmm3 ; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mul3x3_f32: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512VL-NEXT: vmulps %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 -; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] +; AVX512VL-NEXT: vmulps %xmm5, %xmm4, %xmm6 +; AVX512VL-NEXT: vaddps %xmm6, %xmm2, %xmm2 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm3[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vaddps %xmm2, %xmm9, %xmm2 ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] ; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] -; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vmulss %xmm5, %xmm3, %xmm5 ; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10 ; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8 ; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3] ; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5 ; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2] -; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm11 ; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3] ; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12 ; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5 ; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7 -; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12 +; AVX512VL-NEXT: vmulss %xmm3, %xmm8, %xmm12 ; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm12 = xmm8[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 -; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 -; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 -; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm4 +; AVX512VL-NEXT: vmulps %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulss %xmm12, %xmm9, %xmm4 +; AVX512VL-NEXT: vmulss %xmm3, %xmm11, %xmm3 +; AVX512VL-NEXT: vaddss %xmm3, %xmm4, %xmm3 ; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 -; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512VL-NEXT: vaddss %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[2,3] +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq entry: %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> @@ -617,7 +615,6 @@ ; AVX1-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX1-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX1-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -644,15 +641,13 @@ ; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX1-NEXT: vmovapd %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX1-NEXT: vmovapd %xmm0, (%rdi) +; AVX1-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX1-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f64: @@ -675,7 +670,6 @@ ; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -702,70 +696,68 @@ ; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX2-NEXT: vmovapd %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX2-NEXT: vmovapd %xmm0, (%rdi) +; AVX2-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX2-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm4, %xmm3, %xmm10 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 -; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 -; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 -; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 -; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm4, %xmm5, %xmm4 ; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vaddsd %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 -; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512F-NEXT: vmulpd %xmm7, %xmm1, %xmm9 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm10, %xmm11 +; AVX512F-NEXT: vaddpd %xmm11, %xmm9, %xmm9 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512F-NEXT: vaddpd %xmm12, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm7 +; AVX512F-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm11, %xmm3 +; AVX512F-NEXT: vaddpd %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm6, %xmm6 +; AVX512F-NEXT: vaddpd %xmm6, %xmm1, %xmm1 +; AVX512F-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512F-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 -; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) -; AVX512F-NEXT: vmovapd %zmm3, (%rdi) +; AVX512F-NEXT: vmulsd %xmm3, %xmm8, %xmm3 +; AVX512F-NEXT: vaddsd %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm9[1],xmm7[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX512F-NEXT: vmovapd %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -789,39 +781,39 @@ ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512VL-NEXT: vmulpd %xmm3, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm11, %xmm3 ; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 ; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512VL-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm7[1],xmm4[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) +; AVX512VL-NEXT: vmovapd %zmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -178,8 +178,8 @@ ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx ; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -311,8 +311,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -514,8 +514,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzbl 8(%ecx), %ecx -; X86-NEXT: xorb 8(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -537,8 +537,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzwl 8(%ecx), %ecx -; X86-NEXT: xorw 8(%eax), %cx -; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -645,8 +645,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzbl 12(%edx), %edx -; X86-NEXT: xorb 12(%ecx), %dl -; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: movzbl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al @@ -671,8 +671,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzwl 12(%edx), %edx -; X86-NEXT: xorw 12(%ecx), %dx -; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: movzwl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -167,9 +167,9 @@ ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -284,9 +284,9 @@ ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movzbl 4(%rdi), %ecx -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -443,9 +443,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzbl 8(%rdi), %ecx -; X64-NEXT: xorb 8(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzbl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind @@ -459,9 +459,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzwl 8(%rdi), %ecx -; X64-NEXT: xorw 8(%rsi), %cx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzwl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind @@ -490,8 +490,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -1636,10 +1637,96 @@ ; X64-AVX1-LABEL: length48_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm2 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 32(%rsi), %rcx +; X64-AVX1-NEXT: movq 40(%rsi), %rax +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $8, %edx +; X64-AVX1-NEXT: vmovd %ecx, %xmm2 +; X64-AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $16, %edx +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $24, %edx +; X64-AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $40, %rdx +; X64-AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $48, %rdx +; X64-AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rcx +; X64-AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -1649,10 +1736,96 @@ ; X64-AVX2-LABEL: length48_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 32(%rsi), %rcx +; X64-AVX2-NEXT: movq 40(%rsi), %rax +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $8, %edx +; X64-AVX2-NEXT: vmovd %ecx, %xmm2 +; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $16, %edx +; X64-AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $24, %edx +; X64-AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $32, %rdx +; X64-AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $40, %rdx +; X64-AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $48, %rdx +; X64-AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rcx +; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -1662,10 +1835,96 @@ ; X64-AVX512-LABEL: length48_eq: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 32(%rsi), %rcx +; X64-AVX512-NEXT: movq 40(%rsi), %rax +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $8, %edx +; X64-AVX512-NEXT: vmovd %ecx, %xmm2 +; X64-AVX512-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $16, %edx +; X64-AVX512-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $24, %edx +; X64-AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $40, %rdx +; X64-AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $48, %rdx +; X64-AVX512-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rcx +; X64-AVX512-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al @@ -1676,8 +1935,22 @@ ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 ; X64-MIC-AVX-NEXT: vmovdqu (%rsi), %ymm1 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rsi), %xmm3 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 32(%rsi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: movq 40(%rsi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm3, %zmm2, %k0 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; X64-MIC-AVX-NEXT: kortestw %k0, %k1 @@ -1823,9 +2096,52 @@ ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -1835,9 +2151,52 @@ ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -1847,9 +2206,52 @@ ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al @@ -1859,12 +2261,19 @@ ; X64-MIC-AVX-LABEL: length48_eq_const: ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 -; X64-MIC-AVX-NEXT: kortestw %k0, %k1 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [892613426,959985462,858927408,926299444,0,0,0,0] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm0, %zmm1, %k1 +; X64-MIC-AVX-NEXT: kortestw %k1, %k0 ; X64-MIC-AVX-NEXT: setne %al ; X64-MIC-AVX-NEXT: vzeroupper ; X64-MIC-AVX-NEXT: retq @@ -2388,23 +2797,231 @@ ; X64-AVX512BW-LABEL: length96_eq: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: movq 80(%rsi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 88(%rsi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 64(%rsi), %rcx +; X64-AVX512BW-NEXT: movq 72(%rsi), %rax +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $8, %edx +; X64-AVX512BW-NEXT: vmovd %ecx, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $16, %edx +; X64-AVX512BW-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $24, %edx +; X64-AVX512BW-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $32, %rdx +; X64-AVX512BW-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $40, %rdx +; X64-AVX512BW-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $48, %rdx +; X64-AVX512BW-NEXT: vpinsrb $6, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rcx +; X64-AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: setne %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512F-LABEL: length96_eq: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm0 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; X64-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rcx +; X64-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm1, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: setne %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2428,12 +3045,44 @@ ; ; X64-MIC-AVX512F-LABEL: length96_eq: ; X64-MIC-AVX512F: # %bb.0: -; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm0 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; X64-MIC-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-MIC-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rcx +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm1, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: setne %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -2517,10 +3166,98 @@ ; X64-AVX512BW-LABEL: length96_eq_const: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: sete %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2528,10 +3265,26 @@ ; X64-AVX512F-LABEL: length96_eq_const: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: sete %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2556,10 +3309,26 @@ ; X64-MIC-AVX512F-LABEL: length96_eq_const: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: sete %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -106,9 +106,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -197,9 +197,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -94,10 +94,10 @@ ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax -; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -173,10 +173,10 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rdi), %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -230,8 +230,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -106,9 +106,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx -; X86-NEXT: movb 2(%ecx), %cl -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -197,9 +197,9 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: movb 4(%ecx), %cl -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -94,10 +94,10 @@ ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax -; X64-NEXT: movb 2(%rdi), %cl -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rdi), %ecx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -173,10 +173,10 @@ ; X64: # %bb.0: ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax -; X64-NEXT: movb 4(%rdi), %cl -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rdi), %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -230,8 +230,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -206,8 +206,8 @@ ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx ; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -339,8 +339,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -193,9 +193,9 @@ ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -310,9 +310,9 @@ ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movzbl 4(%rdi), %ecx -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -469,9 +469,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzbl 8(%rdi), %ecx -; X64-NEXT: xorb 8(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzbl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind @@ -485,9 +485,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzwl 8(%rdi), %ecx -; X64-NEXT: xorw 8(%rsi), %cx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzwl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind @@ -516,8 +516,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -1493,10 +1494,96 @@ ; X64-AVX1-LABEL: length48_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm2 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 32(%rsi), %rcx +; X64-AVX1-NEXT: movq 40(%rsi), %rax +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $8, %edx +; X64-AVX1-NEXT: vmovd %ecx, %xmm2 +; X64-AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $16, %edx +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $24, %edx +; X64-AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $40, %rdx +; X64-AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $48, %rdx +; X64-AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rcx +; X64-AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -1506,10 +1593,96 @@ ; X64-AVX2-LABEL: length48_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 32(%rsi), %rcx +; X64-AVX2-NEXT: movq 40(%rsi), %rax +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $8, %edx +; X64-AVX2-NEXT: vmovd %ecx, %xmm2 +; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $16, %edx +; X64-AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $24, %edx +; X64-AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $32, %rdx +; X64-AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $40, %rdx +; X64-AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $48, %rdx +; X64-AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rcx +; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -1519,10 +1692,96 @@ ; X64-AVX512-LABEL: length48_eq: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 32(%rsi), %rcx +; X64-AVX512-NEXT: movq 40(%rsi), %rax +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $8, %edx +; X64-AVX512-NEXT: vmovd %ecx, %xmm2 +; X64-AVX512-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $16, %edx +; X64-AVX512-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $24, %edx +; X64-AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $40, %rdx +; X64-AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $48, %rdx +; X64-AVX512-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rcx +; X64-AVX512-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al @@ -1533,8 +1792,22 @@ ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 ; X64-MIC-AVX-NEXT: vmovdqu (%rsi), %ymm1 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rsi), %xmm3 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 32(%rsi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: movq 40(%rsi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm3, %zmm2, %k0 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; X64-MIC-AVX-NEXT: kortestw %k0, %k1 @@ -1606,9 +1879,52 @@ ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -1618,9 +1934,52 @@ ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -1630,9 +1989,52 @@ ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al @@ -1642,12 +2044,19 @@ ; X64-MIC-AVX-LABEL: length48_eq_const: ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 -; X64-MIC-AVX-NEXT: kortestw %k0, %k1 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [892613426,959985462,858927408,926299444,0,0,0,0] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm0, %zmm1, %k1 +; X64-MIC-AVX-NEXT: kortestw %k1, %k0 ; X64-MIC-AVX-NEXT: setne %al ; X64-MIC-AVX-NEXT: vzeroupper ; X64-MIC-AVX-NEXT: retq @@ -2047,23 +2456,231 @@ ; X64-AVX512BW-LABEL: length96_eq: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: movq 80(%rsi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 88(%rsi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 64(%rsi), %rcx +; X64-AVX512BW-NEXT: movq 72(%rsi), %rax +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $8, %edx +; X64-AVX512BW-NEXT: vmovd %ecx, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $16, %edx +; X64-AVX512BW-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $24, %edx +; X64-AVX512BW-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $32, %rdx +; X64-AVX512BW-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $40, %rdx +; X64-AVX512BW-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $48, %rdx +; X64-AVX512BW-NEXT: vpinsrb $6, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rcx +; X64-AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: setne %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq ; ; X64-AVX512F-LABEL: length96_eq: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm0 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; X64-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: shrq $32, %rcx +; X64-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm1, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: setne %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2080,12 +2697,44 @@ ; ; X64-MIC-AVX512F-LABEL: length96_eq: ; X64-MIC-AVX512F: # %bb.0: -; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm0 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; X64-MIC-AVX512F-NEXT: movq 80(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 88(%rsi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 64(%rsi), %rax +; X64-MIC-AVX512F-NEXT: movq 72(%rsi), %rcx +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: shrq $32, %rcx +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm3 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; X64-MIC-AVX512F-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm1, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: setne %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq @@ -2161,10 +2810,98 @@ ; X64-AVX512BW-LABEL: length96_eq_const: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1 +; X64-AVX512BW-NEXT: kortestq %k0, %k1 ; X64-AVX512BW-NEXT: sete %al ; X64-AVX512BW-NEXT: vzeroupper ; X64-AVX512BW-NEXT: retq @@ -2172,10 +2909,26 @@ ; X64-AVX512F-LABEL: length96_eq_const: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-AVX512F-NEXT: kortestw %k1, %k0 +; X64-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: shrq $32, %rax +; X64-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-AVX512F-NEXT: kortestw %k0, %k1 ; X64-AVX512F-NEXT: sete %al ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq @@ -2194,10 +2947,26 @@ ; X64-MIC-AVX512F-LABEL: length96_eq_const: ; X64-MIC-AVX512F: # %bb.0: ; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-MIC-AVX512F-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k0 -; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1 -; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0 +; X64-MIC-AVX512F-NEXT: movq 80(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 88(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX512F-NEXT: movq 64(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: movq 72(%rdi), %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: shrq $32, %rax +; X64-MIC-AVX512F-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-MIC-AVX512F-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1 +; X64-MIC-AVX512F-NEXT: kortestw %k0, %k1 ; X64-MIC-AVX512F-NEXT: sete %al ; X64-MIC-AVX512F-NEXT: vzeroupper ; X64-MIC-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/memset-zero.ll b/llvm/test/CodeGen/X86/memset-zero.ll --- a/llvm/test/CodeGen/X86/memset-zero.ll +++ b/llvm/test/CodeGen/X86/memset-zero.ll @@ -735,10 +735,10 @@ ; SANDYBRIDGE-LABEL: memset_64: ; SANDYBRIDGE: # %bb.0: # %entry ; SANDYBRIDGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; SANDYBRIDGE-NEXT: vmovups %xmm0, 16(%rdi) -; SANDYBRIDGE-NEXT: vmovups %xmm0, (%rdi) ; SANDYBRIDGE-NEXT: vmovups %xmm0, 48(%rdi) ; SANDYBRIDGE-NEXT: vmovups %xmm0, 32(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, 16(%rdi) +; SANDYBRIDGE-NEXT: vmovups %xmm0, (%rdi) ; SANDYBRIDGE-NEXT: retq ; ; SKYLAKE-LABEL: memset_64: diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll --- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll @@ -18,26 +18,26 @@ ; SLOW_32-LABEL: bork: ; SLOW_32: # %bb.0: ; SLOW_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SLOW_32-NEXT: movl $0, 4(%eax) -; SLOW_32-NEXT: movl $0, (%eax) -; SLOW_32-NEXT: movl $0, 12(%eax) -; SLOW_32-NEXT: movl $0, 8(%eax) -; SLOW_32-NEXT: movl $0, 20(%eax) -; SLOW_32-NEXT: movl $0, 16(%eax) -; SLOW_32-NEXT: movl $0, 28(%eax) -; SLOW_32-NEXT: movl $0, 24(%eax) -; SLOW_32-NEXT: movl $0, 36(%eax) -; SLOW_32-NEXT: movl $0, 32(%eax) -; SLOW_32-NEXT: movl $0, 44(%eax) -; SLOW_32-NEXT: movl $0, 40(%eax) -; SLOW_32-NEXT: movl $0, 52(%eax) -; SLOW_32-NEXT: movl $0, 48(%eax) -; SLOW_32-NEXT: movl $0, 60(%eax) -; SLOW_32-NEXT: movl $0, 56(%eax) -; SLOW_32-NEXT: movl $0, 68(%eax) -; SLOW_32-NEXT: movl $0, 64(%eax) ; SLOW_32-NEXT: movl $0, 76(%eax) ; SLOW_32-NEXT: movl $0, 72(%eax) +; SLOW_32-NEXT: movl $0, 68(%eax) +; SLOW_32-NEXT: movl $0, 64(%eax) +; SLOW_32-NEXT: movl $0, 60(%eax) +; SLOW_32-NEXT: movl $0, 56(%eax) +; SLOW_32-NEXT: movl $0, 52(%eax) +; SLOW_32-NEXT: movl $0, 48(%eax) +; SLOW_32-NEXT: movl $0, 44(%eax) +; SLOW_32-NEXT: movl $0, 40(%eax) +; SLOW_32-NEXT: movl $0, 36(%eax) +; SLOW_32-NEXT: movl $0, 32(%eax) +; SLOW_32-NEXT: movl $0, 28(%eax) +; SLOW_32-NEXT: movl $0, 24(%eax) +; SLOW_32-NEXT: movl $0, 20(%eax) +; SLOW_32-NEXT: movl $0, 16(%eax) +; SLOW_32-NEXT: movl $0, 12(%eax) +; SLOW_32-NEXT: movl $0, 8(%eax) +; SLOW_32-NEXT: movl $0, 4(%eax) +; SLOW_32-NEXT: movl $0, (%eax) ; SLOW_32-NEXT: retl ; ; SLOW_64-LABEL: bork: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1226,9 +1226,9 @@ ; ; AVX-LABEL: merge_4f32_f32_X0YY: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vbroadcastss (%rsi), %xmm0 ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq ; ; X86-SSE-LABEL: merge_4f32_f32_X0YY: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -8,13 +8,17 @@ define <8 x double> @merge_8f64_2f64_12u4(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_12u4: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %zmm0 +; ALL-NEXT: vmovups 16(%rdi), %ymm0 +; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_12u4: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %zmm0 +; X86-AVX512F-NEXT: vmovups 16(%eax), %ymm0 +; X86-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 1 %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2 @@ -31,15 +35,19 @@ define <8 x double> @merge_8f64_2f64_23z5(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_23z5: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqu64 32(%rdi), %zmm0 -; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: vmovups 32(%rdi), %ymm0 +; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1 +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_23z5: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovdqu64 32(%eax), %zmm0 -; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovups 32(%eax), %ymm0 +; X86-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X86-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, ptr %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x double>, ptr %ptr, i64 3 @@ -209,7 +217,7 @@ ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 -; X86-AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 3 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -324,8 +324,8 @@ ; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero ; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero ; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero -; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax) ; X86-SSE4A-NEXT: retl @@ -352,8 +352,8 @@ ; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero ; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero ; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero -; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi) ; X64-SSE4A-NEXT: retq @@ -435,8 +435,8 @@ ; X86-SSE4A-NEXT: movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero ; X86-SSE4A-NEXT: movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero ; X86-SSE4A-NEXT: movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero -; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) ; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm2, 16(%eax) ; X86-SSE4A-NEXT: retl @@ -463,8 +463,8 @@ ; X64-SSE4A-NEXT: movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero ; X64-SSE4A-NEXT: movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero ; X64-SSE4A-NEXT: movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero -; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) ; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm2, 16(%rsi) ; X64-SSE4A-NEXT: retq diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -12,8 +12,10 @@ ; ; X64-LABEL: big_nonzero_16_bytes: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4] -; X64-NEXT: vmovups %xmm0, (%rdi) +; X64-NEXT: movabsq $8589934593, %rax # imm = 0x200000001 +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movabsq $17179869187, %rax # imm = 0x400000003 +; X64-NEXT: movq %rax, 8(%rdi) ; X64-NEXT: retq %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1 %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2 @@ -58,9 +60,9 @@ ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] -; X32-NEXT: vmovups %ymm0, (%eax) -; X32-NEXT: vzeroupper +; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] +; X32-NEXT: vmovups %xmm0, (%eax) +; X32-NEXT: vmovups %xmm0, 16(%eax) ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -1884,13 +1884,13 @@ ; ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2009,13 +2009,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2117,13 +2117,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm0, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm0, %xmm2, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2225,13 +2225,13 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2338,13 +2338,13 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2572,14 +2572,20 @@ ; ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_reg: @@ -2805,14 +2811,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_unsigned_reg_reg: @@ -3047,14 +3059,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_reg: @@ -3287,14 +3305,20 @@ ; AVX512VL-FALLBACK-LABEL: vec128_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_reg_mem: @@ -3536,14 +3560,20 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VL-FALLBACK-NEXT: vzeroupper ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-FALLBACK-LABEL: vec128_i8_signed_mem_mem: diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1328,13 +1328,13 @@ ; ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1456,13 +1456,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1584,13 +1584,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm0, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1712,13 +1712,13 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -1845,13 +1845,13 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %ymm1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2032,13 +2032,23 @@ ; ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2221,13 +2231,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2410,13 +2430,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2599,13 +2629,23 @@ ; AVX512VL-FALLBACK-LABEL: vec256_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2793,13 +2833,23 @@ ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -274,22 +274,19 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -301,22 +298,19 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -354,19 +348,17 @@ ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -381,19 +373,17 @@ ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm7, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -431,22 +421,19 @@ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -459,22 +446,19 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -512,22 +496,19 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -540,22 +521,19 @@ ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -594,22 +572,19 @@ ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -623,22 +598,19 @@ ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -679,62 +651,88 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm8, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -765,62 +763,90 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512F-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4 -; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 +; AVX512VL-FALLBACK-NEXT: vpternlogd $190, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm7 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm4, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -853,66 +879,92 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm0, %ymm8, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm0, %ymm8, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -943,66 +995,92 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: @@ -1033,67 +1111,93 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_mem: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512VL-FALLBACK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm5, %zmm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 -; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm2, %ymm8, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpor %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm8, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -17,10 +17,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %d = load <16 x i32>, <16 x i32>* %a @@ -48,12 +48,54 @@ define dso_local void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" { ; CHECK-LABEL: avg_v64i8_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rsi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqu %ymm1, (%rax) +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; CHECK-NEXT: vpmovdb %ymm15, %xmm15 +; CHECK-NEXT: vpmovdb %ymm14, %xmm14 +; CHECK-NEXT: vpmovdb %ymm13, %xmm13 +; CHECK-NEXT: vpmovdb %ymm12, %xmm12 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; CHECK-NEXT: vpmovdb %ymm7, %xmm7 +; CHECK-NEXT: vpmovdb %ymm6, %xmm6 +; CHECK-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm12 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; CHECK-NEXT: vpmovdb %ymm5, %xmm5 +; CHECK-NEXT: vpmovdb %ymm4, %xmm4 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; CHECK-NEXT: vpmovdb %ymm11, %xmm5 +; CHECK-NEXT: vpmovdb %ymm10, %xmm6 +; CHECK-NEXT: vpavgb %ymm4, %ymm12, %ymm4 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; CHECK-NEXT: vpmovdb %ymm9, %xmm6 +; CHECK-NEXT: vpmovdb %ymm8, %xmm7 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; CHECK-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-NEXT: vpmovdb %ymm3, %xmm3 +; CHECK-NEXT: vpmovdb %ymm2, %xmm2 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-NEXT: vpmovdb %ymm1, %xmm1 +; CHECK-NEXT: vpmovdb %ymm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vpavgb %ymm5, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%rax) +; CHECK-NEXT: vmovdqu %ymm4, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %a @@ -72,8 +114,29 @@ define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" { ; CHECK-LABEL: avg_v64i8_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovdb %zmm7, %xmm7 +; CHECK-NEXT: vpmovdb %zmm6, %xmm6 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; CHECK-NEXT: vpmovdb %zmm5, %xmm5 +; CHECK-NEXT: vpmovdb %zmm4, %xmm4 +; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; CHECK-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; CHECK-NEXT: vpmovdb %zmm3, %xmm3 +; CHECK-NEXT: vpmovdb %zmm2, %xmm2 +; CHECK-NEXT: vpmovdb %zmm1, %xmm1 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; CHECK-NEXT: vpavgb %zmm4, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -92,12 +155,30 @@ define dso_local void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" { ; CHECK-LABEL: pmaddwd_32_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpmovsxwd 48(%rdi), %ymm0 +; CHECK-NEXT: vpmovsxwd 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxwd 16(%rdi), %ymm2 +; CHECK-NEXT: vpmovsxwd (%rdi), %ymm3 +; CHECK-NEXT: vpmovsxwd 48(%rsi), %ymm4 +; CHECK-NEXT: vpmovsxwd 32(%rsi), %ymm5 +; CHECK-NEXT: vpmovsxwd 16(%rsi), %ymm6 +; CHECK-NEXT: vpmovsxwd (%rsi), %ymm7 +; CHECK-NEXT: vpmovdw %ymm7, %xmm7 +; CHECK-NEXT: vpmovdw %ymm6, %xmm6 +; CHECK-NEXT: vpmovdw %ymm3, %xmm3 +; CHECK-NEXT: vpmovdw %ymm2, %xmm2 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; CHECK-NEXT: vpmovdw %ymm5, %xmm3 +; CHECK-NEXT: vpmovdw %ymm4, %xmm4 +; CHECK-NEXT: vpmaddwd %ymm6, %ymm2, %ymm2 +; CHECK-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; CHECK-NEXT: vpmovdw %ymm1, %xmm1 +; CHECK-NEXT: vpmovdw %ymm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vpmaddwd %ymm3, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa %ymm0, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %A = load <32 x i16>, <32 x i16>* %APtr @@ -115,8 +196,17 @@ define dso_local void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" { ; CHECK-LABEL: pmaddwd_32_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 +; CHECK-NEXT: vpmovsxwd 32(%rdi), %zmm0 +; CHECK-NEXT: vpmovsxwd (%rdi), %zmm1 +; CHECK-NEXT: vpmovsxwd 32(%rsi), %zmm2 +; CHECK-NEXT: vpmovsxwd (%rsi), %zmm3 +; CHECK-NEXT: vpmovdw %zmm3, %ymm3 +; CHECK-NEXT: vpmovdw %zmm2, %ymm2 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-NEXT: vpmovdw %zmm1, %ymm1 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; CHECK-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -137,10 +227,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <64 x i8>, <64 x i8>* %xptr @@ -180,14 +270,22 @@ ; CHECK-SKX-NEXT: .p2align 4, 0x90 ; CHECK-SKX-NEXT: .LBB8_1: # %vector.body ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-SKX-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 +; CHECK-SKX-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 +; CHECK-SKX-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm8 +; CHECK-SKX-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm9 +; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %xmm10 +; CHECK-SKX-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-SKX-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm6 +; CHECK-SKX-NEXT: vpmaddwd %ymm5, %ymm6, %ymm5 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-SKX-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; CHECK-SKX-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm4 +; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-SKX-NEXT: addq $32, %rcx ; CHECK-SKX-NEXT: cmpq %rcx, %rax ; CHECK-SKX-NEXT: jne .LBB8_1 @@ -196,9 +294,9 @@ ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -215,14 +313,22 @@ ; CHECK-AVX512-NEXT: .p2align 4, 0x90 ; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 +; CHECK-AVX512-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 +; CHECK-AVX512-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm8 +; CHECK-AVX512-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm9 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %xmm10 +; CHECK-AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-AVX512-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm6 +; CHECK-AVX512-NEXT: vpmaddwd %ymm5, %ymm6, %ymm5 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-AVX512-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm4 +; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-AVX512-NEXT: addq $32, %rcx ; CHECK-AVX512-NEXT: cmpq %rcx, %rax ; CHECK-AVX512-NEXT: jne .LBB8_1 @@ -231,9 +337,9 @@ ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -250,14 +356,22 @@ ; CHECK-VBMI-NEXT: .p2align 4, 0x90 ; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 +; CHECK-VBMI-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 +; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 +; CHECK-VBMI-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm8 +; CHECK-VBMI-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm9 +; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %xmm10 +; CHECK-VBMI-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; CHECK-VBMI-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm6 +; CHECK-VBMI-NEXT: vpmaddwd %ymm5, %ymm6, %ymm5 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm5, %ymm1 +; CHECK-VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; CHECK-VBMI-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm4 +; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: addq $32, %rcx ; CHECK-VBMI-NEXT: cmpq %rcx, %rax ; CHECK-VBMI-NEXT: jne .LBB8_1 @@ -266,9 +380,9 @@ ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -320,8 +434,12 @@ ; CHECK-SKX-NEXT: .p2align 4, 0x90 ; CHECK-SKX-NEXT: .LBB9_1: # %vector.body ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-SKX-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-SKX-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-SKX-NEXT: addq $32, %rcx @@ -332,9 +450,9 @@ ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -350,8 +468,12 @@ ; CHECK-AVX512-NEXT: .p2align 4, 0x90 ; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: addq $32, %rcx @@ -362,9 +484,9 @@ ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -380,8 +502,12 @@ ; CHECK-VBMI-NEXT: .p2align 4, 0x90 ; CHECK-VBMI-NEXT: .LBB9_1: # %vector.body ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2 +; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4 +; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5 +; CHECK-VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; CHECK-VBMI-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3 ; CHECK-VBMI-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; CHECK-VBMI-NEXT: addq $32, %rcx @@ -392,9 +518,9 @@ ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -456,9 +582,9 @@ ; CHECK-SKX-NEXT: # %bb.2: # %middle.block ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -481,9 +607,9 @@ ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -506,9 +632,9 @@ ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -567,9 +693,9 @@ ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax @@ -592,9 +718,9 @@ ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax @@ -617,9 +743,9 @@ ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax @@ -889,23 +1015,23 @@ ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 -; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq ; @@ -915,27 +1041,27 @@ ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; CHECK-AVX512-NEXT: vpand %ymm5, %ymm2, %ymm2 +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq ; @@ -945,23 +1071,23 @@ ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 -; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm0 +; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm2, %ymm1 ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-VBMI-NEXT: vzeroupper ; CHECK-VBMI-NEXT: retq %d = load <64 x i8>, <64 x i8>* %a @@ -1069,11 +1195,12 @@ ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3 ; CHECK-NEXT: vpmovqb %ymm3, %xmm3 ; CHECK-NEXT: vpmovqb %ymm2, %xmm2 -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; CHECK-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %a = load <16 x i64>, <16 x i64>* %x @@ -1890,10 +2017,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x i64>, <8 x i64>* %xptr @@ -1909,12 +2036,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rsi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1 -; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 +; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1 ; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0 -; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <8 x i64>, <8 x i64>* %xptr diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll --- a/llvm/test/CodeGen/X86/mmx-cvt.ll +++ b/llvm/test/CodeGen/X86/mmx-cvt.ll @@ -260,17 +260,11 @@ define <2 x double> @sitofp_v2i32_v2f64(ptr) nounwind { ; X86-LABEL: sitofp_v2i32_v2f64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $8, %esp -; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movq (%eax), %mm0 ; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: cvtdq2pd (%esp), %xmm0 -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp +; X86-NEXT: movq2dq %mm0, %xmm0 +; X86-NEXT: cvtdq2pd %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_v2i32_v2f64: diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -259,12 +259,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al -; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: setb %al ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_sign: @@ -298,11 +294,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v8i16_sign: @@ -350,9 +343,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 ; KNL-NEXT: setb %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -401,9 +393,8 @@ ; KNL: # %bb.0: ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -456,18 +447,15 @@ ; ; KNL-LABEL: allones_v32i16_sign: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 -; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; KNL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 +; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -524,9 +512,8 @@ ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper @@ -976,7 +963,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -1002,8 +989,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -1013,11 +1002,29 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: allzeros_v2i64_not: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: setne %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: allzeros_v2i64_not: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: retq +; +; KNL-LABEL: allzeros_v2i64_not: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al +; KNL-NEXT: setne %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v2i64_not: +; SKX: # %bb.0: +; SKX-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: cmpb $3, %al +; SKX-NEXT: setne %al +; SKX-NEXT: retq %1 = icmp eq <2 x i64> %a0, zeroinitializer %2 = bitcast <2 x i1> %1 to i2 %3 = icmp ne i2 %2, -1 @@ -1025,29 +1032,40 @@ } define i1 @allzeros_v8i32_not(<8 x i32> %a0) { -; SSE2-LABEL: allzeros_v8i32_not: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq +; SSE-LABEL: allzeros_v8i32_not: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; SSE41-LABEL: allzeros_v8i32_not: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; AVX1-LABEL: allzeros_v8i32_not: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; AVX-LABEL: allzeros_v8i32_not: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: allzeros_v8i32_not: +; AVX2: # %bb.0: +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: allzeros_v8i32_not: +; AVX512: # %bb.0: +; AVX512-NEXT: vptest %ymm0, %ymm0 +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = icmp eq <8 x i32> %a0, zeroinitializer %2 = bitcast <8 x i1> %1 to i8 %3 = icmp ne i8 %2, -1 @@ -1057,38 +1075,71 @@ define i1 @allzeros_v8i64_not(<8 x i64> %a0) { ; SSE2-LABEL: allzeros_v8i64_not: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: allzeros_v8i64_not: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_not: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: setne %al +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setae %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_not: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: setne %al +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setae %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1106,29 +1157,27 @@ } define i1 @allzeros_v16i8_and1(<16 x i8> %arg) { -; SSE2-LABEL: allzeros_v16i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v16i8_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl %eax, %eax ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v16i8_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: vpsllw $7, %xmm0, %xmm0 +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: testl %eax, %eax ; KNL-NEXT: sete %al ; KNL-NEXT: retq ; @@ -1199,44 +1248,51 @@ } define i1 @allzeros_v32i8_and1(<32 x i8> %arg) { -; SSE2-LABEL: allzeros_v32i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i8_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and1: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1258,11 +1314,11 @@ ; ; AVX1-LABEL: allones_v64i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -1306,30 +1362,27 @@ } define i1 @allzeros_v64i8_and1(<64 x i8> %arg) { -; SSE2-LABEL: allzeros_v64i8_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v64i8_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v64i8_and1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1337,19 +1390,31 @@ ; AVX2-LABEL: allzeros_v64i8_and1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v64i8_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v64i8_and1: +; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v64i8_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 to i64 @@ -1378,14 +1443,12 @@ ; ; KNL-LABEL: allones_v8i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al +; KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; KNL-NEXT: vptest %xmm1, %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_and1: @@ -1402,28 +1465,26 @@ } define i1 @allzeros_v8i16_and1(<8 x i16> %arg) { -; SSE2-LABEL: allzeros_v8i16_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v8i16_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v8i16_and1: ; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al ; KNL-NEXT: retq @@ -1477,12 +1538,12 @@ ; ; KNL-LABEL: allones_v16i16_and1: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: setb %al +; KNL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1546,18 +1607,13 @@ ; KNL-LABEL: allones_v32i16_and1: ; KNL: # %bb.0: ; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -1577,50 +1633,69 @@ } define i1 @allzeros_v32i16_and1(<32 x i16> %arg) { -; SSE2-LABEL: allzeros_v32i16_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq -; +; SSE-LABEL: allzeros_v32i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm3 +; SSE-NEXT: psllw $15, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: psllw $15, %xmm1 +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; ; AVX1-LABEL: allzeros_v32i16_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i16_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i16_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %ymm0, %ymm1 +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i16_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -1629,44 +1704,53 @@ } define i1 @allzeros_v16i16_and1(<16 x i16> %arg) { -; SSE2-LABEL: allzeros_v16i16_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i16_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i16_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm1 +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i16_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i16_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v16i16_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and1: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and1: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -1716,36 +1800,35 @@ } define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { -; SSE2-LABEL: allzeros_v4i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v4i32_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v4i32_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v4i32_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, @@ -1812,33 +1895,30 @@ } define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { -; SSE2-LABEL: allzeros_v8i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1917,39 +1997,46 @@ } define i1 @allzeros_v16i32_and1(<16 x i32> %arg) { -; SSE2-LABEL: allzeros_v16i32_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i32_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i32_and1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm3 +; SSE-NEXT: pslld $31, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pslld $31, %xmm1 +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2011,36 +2098,36 @@ } define i1 @allzeros_v2i64_and1(<2 x i64> %arg) { -; SSE2-LABEL: allzeros_v2i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v2i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v2i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and1: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and1: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -2107,44 +2194,51 @@ } define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { -; SSE2-LABEL: allzeros_v4i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm1 +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vtestpd %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and1: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and1: +; SKX: # %bb.0: +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -2220,39 +2314,42 @@ } define i1 @allzeros_v8i64_and1(<8 x i64> %arg) { -; SSE2-LABEL: allzeros_v8i64_and1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i64_and1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i64_and1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm3 +; SSE-NEXT: psllq $63, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psllq $63, %xmm1 +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_and1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_and1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2311,29 +2408,27 @@ } define i1 @allzeros_v16i8_and4(<16 x i8> %arg) { -; SSE2-LABEL: allzeros_v16i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v16i8_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl %eax, %eax ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v16i8_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: vpsllw $5, %xmm0, %xmm0 +; KNL-NEXT: vpmovmskb %xmm0, %eax +; KNL-NEXT: testl %eax, %eax ; KNL-NEXT: sete %al ; KNL-NEXT: retq ; @@ -2404,44 +2499,51 @@ } define i1 @allzeros_v32i8_and4(<32 x i8> %arg) { -; SSE2-LABEL: allzeros_v32i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i8_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i8_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i8_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i8_and4: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i8> %arg, %tmp1 = icmp ne <32 x i8> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2463,11 +2565,11 @@ ; ; AVX1-LABEL: allones_v64i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -2511,30 +2613,27 @@ } define i1 @allzeros_v64i8_and4(<64 x i8> %arg) { -; SSE2-LABEL: allzeros_v64i8_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v64i8_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v64i8_and4: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v64i8_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2542,19 +2641,31 @@ ; AVX2-LABEL: allzeros_v64i8_and4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [289360691352306692,289360691352306692,289360691352306692,289360691352306692] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v64i8_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v64i8_and4: +; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 +; KNL-NEXT: vpmovmskb %ymm0, %eax +; KNL-NEXT: testl %eax, %eax +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v64i8_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <64 x i8> %arg, %tmp1 = icmp ne <64 x i8> %tmp, zeroinitializer %tmp2 = bitcast <64 x i1> %tmp1 to i64 @@ -2583,14 +2694,11 @@ ; ; KNL-LABEL: allones_v8i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, %xmm0, %xmm0 -; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: cmpb $-1, %al +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al -; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allones_v8i16_and4: @@ -2607,28 +2715,26 @@ } define i1 @allzeros_v8i16_and4(<8 x i16> %arg) { -; SSE2-LABEL: allzeros_v8i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v8i16_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v8i16_and4: ; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %xmm0, %xmm0 +; KNL-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; KNL-NEXT: sete %al ; KNL-NEXT: retq @@ -2682,12 +2788,12 @@ ; ; KNL-LABEL: allones_v16i16_and4: ; KNL: # %bb.0: -; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: setb %al +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -2751,18 +2857,13 @@ ; KNL-LABEL: allones_v32i16_and4: ; KNL: # %bb.0: ; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 -; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 -; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl %eax, %ecx -; KNL-NEXT: cmpl $65535, %ecx # imm = 0xFFFF +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: sete %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -2782,50 +2883,69 @@ } define i1 @allzeros_v32i16_and4(<32 x i16> %arg) { -; SSE2-LABEL: allzeros_v32i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v32i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v32i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $13, %xmm3 +; SSE-NEXT: psllw $13, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: psllw $13, %xmm1 +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v32i16_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllw $13, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $13, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllw $13, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v32i16_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $13, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v32i16_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v32i16_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %ymm0, %ymm1 +; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 +; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: kortestw %k0, %k0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v32i16_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; SKX-NEXT: kortestw %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <32 x i16> %arg, %tmp1 = icmp ne <32 x i16> %tmp, zeroinitializer %tmp2 = bitcast <32 x i1> %tmp1 to i32 @@ -2834,44 +2954,54 @@ } define i1 @allzeros_v16i16_and4(<16 x i16> %arg) { -; SSE2-LABEL: allzeros_v16i16_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $5, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i16_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i16_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllw $13, %xmm1 +; SSE-NEXT: psllw $13, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i16_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsllw $13, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $13, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i16_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $13, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl $-1431655766, %eax # imm = 0xAAAAAAAA ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v16i16_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v16i16_and4: +; KNL: # %bb.0: +; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 +; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; KNL-NEXT: vptest %ymm1, %ymm0 +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v16i16_and4: +; SKX: # %bb.0: +; SKX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1125917086973956,1125917086973956,1125917086973956,1125917086973956] +; SKX-NEXT: vptest %ymm1, %ymm0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <16 x i16> %arg, %tmp1 = icmp ne <16 x i16> %tmp, zeroinitializer %tmp2 = bitcast <16 x i1> %tmp1 to i16 @@ -2921,36 +3051,35 @@ } define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { -; SSE2-LABEL: allzeros_v4i32_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v4i32_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v4i32_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v4i32_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17179869188,17179869188] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, @@ -3017,33 +3146,30 @@ } define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { -; SSE2-LABEL: allzeros_v8i32_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm1 +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3122,39 +3248,46 @@ } define i1 @allzeros_v16i32_and4(<16 x i32> %arg) { -; SSE2-LABEL: allzeros_v16i32_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v16i32_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v16i32_and4: +; SSE: # %bb.0: +; SSE-NEXT: pslld $29, %xmm3 +; SSE-NEXT: pslld $29, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pslld $29, %xmm1 +; SSE-NEXT: pslld $29, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v16i32_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm2, %xmm2 +; AVX1-NEXT: vpslld $29, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v16i32_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17179869188,17179869188,17179869188,17179869188] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $29, %ymm1, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1 +; AVX2-NEXT: vpslld $29, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3216,36 +3349,36 @@ } define i1 @allzeros_v2i64_and4(<2 x i64> %arg) { -; SSE2-LABEL: allzeros_v2i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v2i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v2i64_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: allzeros_v2i64_and4: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: allzeros_v2i64_and4: ; KNL: # %bb.0: -; KNL-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al ; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: allzeros_v2i64_and4: ; SKX: # %bb.0: -; SKX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] -; SKX-NEXT: vptest %xmm1, %xmm0 +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, @@ -3312,44 +3445,51 @@ } define i1 @allzeros_v4i64_and4(<4 x i64> %arg) { -; SSE2-LABEL: allzeros_v4i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v4i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v4i64_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm1 +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v4i64_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vtestpd %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v4i64_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: allzeros_v4i64_and4: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; KNL-LABEL: allzeros_v4i64_and4: +; KNL: # %bb.0: +; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; KNL-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $15, %al +; KNL-NEXT: sete %al +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; +; SKX-LABEL: allzeros_v4i64_and4: +; SKX: # %bb.0: +; SKX-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k0 +; SKX-NEXT: kortestb %k0, %k0 +; SKX-NEXT: sete %al +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq %tmp = and <4 x i64> %arg, %tmp1 = icmp ne <4 x i64> %tmp, zeroinitializer %tmp2 = bitcast <4 x i1> %tmp1 to i4 @@ -3425,39 +3565,42 @@ } define i1 @allzeros_v8i64_and4(<8 x i64> %arg) { -; SSE2-LABEL: allzeros_v8i64_and4: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $29, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testb $5, %al -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: allzeros_v8i64_and4: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: allzeros_v8i64_and4: +; SSE: # %bb.0: +; SSE-NEXT: psllq $61, %xmm3 +; SSE-NEXT: psllq $61, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: psllq $61, %xmm1 +; SSE-NEXT: psllq $61, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: allzeros_v8i64_and4: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq $61, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: allzeros_v8i64_and4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $61, %ymm1, %ymm1 +; AVX2-NEXT: vpsllq $61, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3731,20 +3874,34 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpgtw %xmm1, %xmm0 ; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: notb %al -; SSE-NEXT: testb $-109, %al -; SSE-NEXT: sete %al +; SSE-NEXT: pmovmskb %xmm0, %ecx +; SSE-NEXT: movl %ecx, %edx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: andb %cl, %al +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shrb $7, %cl +; SSE-NEXT: andb $16, %dl +; SSE-NEXT: shrb $4, %dl +; SSE-NEXT: andb %cl, %dl +; SSE-NEXT: andb %dl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_v8i16: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: notb %al -; AVX1OR2-NEXT: testb $-109, %al -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vpmovmskb %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %edx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al +; AVX1OR2-NEXT: # kill: def $cl killed $cl killed $ecx +; AVX1OR2-NEXT: shrb $7, %cl +; AVX1OR2-NEXT: andb $16, %dl +; AVX1OR2-NEXT: shrb $4, %dl +; AVX1OR2-NEXT: andb %cl, %dl +; AVX1OR2-NEXT: andb %dl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_v8i16: @@ -3840,24 +3997,31 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: sete %al +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: movmsk_and_v2i64: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 -; SSE41-NEXT: movmskpd %xmm0, %eax -; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: sete %al +; SSE41-NEXT: movmskpd %xmm0, %ecx +; SSE41-NEXT: xorl $3, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb %al +; SSE41-NEXT: andb %cl, %al ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_and_v2i64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: xorl $3, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_and_v2i64: @@ -3891,24 +4055,39 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: setne %al +; SSE2-NEXT: movmskpd %xmm1, %ecx +; SSE2-NEXT: xorl $3, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: movmsk_or_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: setne %al +; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 +; SSE41-NEXT: movmskpd %xmm0, %ecx +; SSE41-NEXT: xorl $3, %ecx +; SSE41-NEXT: movl %ecx, %eax +; SSE41-NEXT: shrb %al +; SSE41-NEXT: orb %cl, %al ; SSE41-NEXT: retq ; -; AVX-LABEL: movmsk_or_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: setne %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: movmsk_or_v2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: xorl $3, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: movmsk_or_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: setne %al +; AVX512-NEXT: retq %cmp = icmp ne <2 x i64> %x, %y %e1 = extractelement <2 x i1> %cmp, i32 0 %e2 = extractelement <2 x i1> %cmp, i32 1 @@ -3941,18 +4120,24 @@ ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpeq_uqps %zmm1, %zmm0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kmovw %k1, %ecx ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: testb $14, %al +; KNL-NEXT: testb $6, %al ; KNL-NEXT: setne %al +; KNL-NEXT: orb %cl, %al ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: movmsk_v4f32: ; SKX: # %bb.0: ; SKX-NEXT: vcmpeq_uqps %xmm1, %xmm0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kmovd %k1, %ecx ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $14, %al +; SKX-NEXT: testb $6, %al ; SKX-NEXT: setne %al +; SKX-NEXT: orb %cl, %al ; SKX-NEXT: retq %cmp = fcmp ueq <4 x float> %x, %y %e1 = extractelement <4 x i1> %cmp, i32 1 @@ -3967,17 +4152,19 @@ ; SSE-LABEL: movmsk_and_v2f64: ; SSE: # %bb.0: ; SSE-NEXT: cmplepd %xmm0, %xmm1 -; SSE-NEXT: movmskpd %xmm1, %eax -; SSE-NEXT: cmpl $3, %eax -; SSE-NEXT: sete %al +; SSE-NEXT: movmskpd %xmm1, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: andb %cl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_and_v2f64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 -; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_and_v2f64: @@ -4010,16 +4197,19 @@ ; SSE-LABEL: movmsk_or_v2f64: ; SSE: # %bb.0: ; SSE-NEXT: cmplepd %xmm0, %xmm1 -; SSE-NEXT: movmskpd %xmm1, %eax -; SSE-NEXT: testl %eax, %eax -; SSE-NEXT: setne %al +; SSE-NEXT: movmskpd %xmm1, %ecx +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb %al +; SSE-NEXT: orb %cl, %al ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: movmsk_or_v2f64: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: retq ; ; KNL-LABEL: movmsk_or_v2f64: @@ -4358,30 +4548,23 @@ ; KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %eax -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: testb $1, %al -; KNL-NEXT: movl $42, %eax -; KNL-NEXT: movl $99, %edx -; KNL-NEXT: cmovel %edx, %eax -; KNL-NEXT: testb $1, %cl -; KNL-NEXT: cmovel %edx, %eax +; KNL-NEXT: knotw %k0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testb $3, %al +; KNL-NEXT: movl $42, %ecx +; KNL-NEXT: movl $99, %eax +; KNL-NEXT: cmovel %ecx, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: PR39665_c_ray: ; SKX: # %bb.0: ; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %eax -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: testb $1, %al -; SKX-NEXT: movl $42, %eax -; SKX-NEXT: movl $99, %edx -; SKX-NEXT: cmovel %edx, %eax -; SKX-NEXT: testb $1, %cl -; SKX-NEXT: cmovel %edx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: cmpb $3, %al +; SKX-NEXT: movl $42, %ecx +; SKX-NEXT: movl $99, %eax +; SKX-NEXT: cmovel %ecx, %eax ; SKX-NEXT: retq %cmp = fcmp ogt <2 x double> %x, %y %e1 = extractelement <2 x i1> %cmp, i32 0 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -109,12 +109,12 @@ define void @PR13897() nounwind { ; X64-LABEL: PR13897: ; X64: # %bb.0: # %"0x0" -; X64-NEXT: movq bbb(%rip), %rax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shlq $32, %rax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: movq %rax, aaa+8(%rip) -; X64-NEXT: movq %rax, aaa(%rip) +; X64-NEXT: movl bbb(%rip), %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rcx, aaa+8(%rip) +; X64-NEXT: movq %rcx, aaa(%rip) ; X64-NEXT: retq ; ; X86-LABEL: PR13897: diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -135,22 +135,22 @@ ; SSE2-LABEL: _mul4xi32toi64a: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE42-LABEL: _mul4xi32toi64a: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,3,3] ; SSE42-NEXT: pmuludq %xmm3, %xmm2 -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE42-NEXT: pmuludq %xmm1, %xmm0 +; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE42-NEXT: pmuludq %xmm4, %xmm0 ; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: retq ; @@ -288,11 +288,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> %lower1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -52,11 +52,11 @@ ; LINUX-NEXT: movq %rdx, %r13 ; LINUX-NEXT: movq %rsi, %rbp ; LINUX-NEXT: movq %rdi, %rbx -; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; LINUX-NEXT: testb %al, %al ; LINUX-NEXT: je .LBB0_2 ; LINUX-NEXT: # %bb.1: @@ -145,11 +145,11 @@ ; LINUX-X32-NEXT: movq %rdx, %r13 ; LINUX-X32-NEXT: movq %rsi, %rbp ; LINUX-X32-NEXT: movq %rdi, %rbx -; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %r9, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r8, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rcx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rdx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rsi, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: testb %al, %al ; LINUX-X32-NEXT: je .LBB0_2 ; LINUX-X32-NEXT: # %bb.1: @@ -218,9 +218,9 @@ ; WINDOWS-NEXT: movq %r8, %rdi ; WINDOWS-NEXT: movq %rdx, %rbx ; WINDOWS-NEXT: movq %rcx, %r14 -; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: callq get_f diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -36,7 +36,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -180,7 +180,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -64,12 +64,14 @@ ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: subss {{[0-9]+}}(%esp), %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm2 -; CHECK-NEXT: mulss %xmm0, %xmm2 -; CHECK-NEXT: subss %xmm1, %xmm0 -; CHECK-NEXT: divss %xmm2, %xmm0 -; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: mulss %xmm0, %xmm3 +; CHECK-NEXT: subss %xmm2, %xmm1 +; CHECK-NEXT: addss %xmm0, %xmm1 +; CHECK-NEXT: divss %xmm3, %xmm1 +; CHECK-NEXT: movss %xmm1, (%esp) ; CHECK-NEXT: flds (%esp) ; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/no-wide-load.ll b/llvm/test/CodeGen/X86/no-wide-load.ll --- a/llvm/test/CodeGen/X86/no-wide-load.ll +++ b/llvm/test/CodeGen/X86/no-wide-load.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movzwl 4(%rdi), %eax -; CHECK-NEXT: andl $-1121, %eax # imm = 0xFB9F +; CHECK-NEXT: andl $64415, %eax # imm = 0xFB9F ; CHECK-NEXT: orl $1024, %eax # imm = 0x400 ; CHECK-NEXT: movw %ax, 4(%rdi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll --- a/llvm/test/CodeGen/X86/nontemporal-loads.ll +++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll @@ -828,12 +828,12 @@ ; ; AVX1-LABEL: test_arg_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i32: @@ -908,12 +908,12 @@ ; ; AVX1-LABEL: test_arg_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v4i64: @@ -949,12 +949,12 @@ ; ; AVX1-LABEL: test_arg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i16: @@ -990,12 +990,12 @@ ; ; AVX1-LABEL: test_arg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i8: @@ -1090,18 +1090,18 @@ ; ; AVX1-LABEL: test_arg_v16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i32: @@ -1196,18 +1196,18 @@ ; ; AVX1-LABEL: test_arg_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i64: @@ -1251,18 +1251,18 @@ ; ; AVX1-LABEL: test_arg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i16: @@ -1326,18 +1326,18 @@ ; ; AVX1-LABEL: test_arg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v64i8: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -236,12 +236,11 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,0,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,6,4,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,2,3,2,4,5,6,7] ; SSE2-NEXT: movw %ax, 12(%rdi) -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movq %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i16: @@ -473,7 +472,7 @@ ; SSE2-LABEL: v12i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] @@ -481,7 +480,7 @@ ; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: movaps %xmm2, 16(%rdi) @@ -799,32 +798,32 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] -; SSE2-NEXT: packuswb %xmm5, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] -; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] +; SSE2-NEXT: packuswb %xmm4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,6] +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,1,3,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] @@ -833,7 +832,7 @@ ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movq %xmm2, 16(%rdi) -; SSE2-NEXT: movdqu %xmm4, (%rdi) +; SSE2-NEXT: movdqu %xmm3, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i8_in: @@ -858,42 +857,42 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vmovdqu %xmm1, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleave_24i8_in: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, 16(%rdi) -; AVX2-NEXT: vmovdqu %xmm2, (%rdi) +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,29],zero,ymm0[22,30],zero,ymm0[23,31],zero,ymm0[u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpbroadcastq (%rcx), %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, 16(%rdi) +; AVX2-NEXT: vmovdqu %xmm0, (%rdi) +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: interleave_24i8_in: ; XOP: # %bb.0: ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,8],xmm0[0],xmm1[1,9],xmm0[1],xmm1[2,10],xmm0[2],xmm1[3,11],xmm0[3],xmm1[4,12],xmm0[4],xmm1[5] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[13],xmm0[5],xmm1[6,14],xmm0[6],xmm1[7,15],xmm0[7],xmm1[u,u,u,u,u,u,u,u] +; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,8],xmm1[0],xmm0[1,9],xmm1[1],xmm0[2,10],xmm1[2],xmm0[3,11],xmm1[3],xmm0[4,12],xmm1[4],xmm0[5] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[13],xmm1[5],xmm0[6,14],xmm1[6],xmm0[7,15],xmm1[7],xmm0[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, 16(%rdi) ; XOP-NEXT: vmovdqu %xmm2, (%rdi) ; XOP-NEXT: retq @@ -1037,12 +1036,12 @@ ; XOP-NEXT: vmovdqu (%rdi), %xmm0 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11] -; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13] -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13],xmm2[2,3,8,9,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[0,1,6,7,12,13],xmm1[2,3,8,9,14,15],xmm0[u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,7,8,9,10,11],xmm2[4,5,10,11] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[2,3,8,9,14,15],xmm1[4,5,10,11,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,7,8,9],xmm2[0,1,6,7,12,13] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,u,u,u,u,u,u] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15] ; XOP-NEXT: vmovdqu %xmm3, (%rsi) ; XOP-NEXT: vmovdqu %xmm4, (%rdx) ; XOP-NEXT: vmovdqu %xmm0, (%rcx) @@ -1187,12 +1186,12 @@ ; XOP-NEXT: vmovdqu (%rdi), %xmm0 ; XOP-NEXT: vmovdqu 16(%rdi), %xmm1 ; XOP-NEXT: vmovdqu 32(%rdi), %xmm2 -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[12,13,6,7,0,1,10,11,4,5] -; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[10,11,4,5,14,15,8,9,2,3] -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[14,15,8,9,2,3,12,13,6,7,0,1] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,12,13,6,7,0,1],xmm0[10,11,4,5] +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3],xmm3[6,7,8,9,10,11,12,13,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u],xmm1[10,11,4,5],xmm0[14,15,8,9,2,3] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1],xmm4[6,7,8,9,10,11,12,13,14,15] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[u,u,u,u,14,15,8,9,2,3],xmm0[12,13,6,7,0,1] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm2[10,11,4,5],xmm0[4,5,6,7,8,9,10,11,12,13,14,15] ; XOP-NEXT: vmovdqu %xmm3, (%rsi) ; XOP-NEXT: vmovdqu %xmm4, (%rdx) ; XOP-NEXT: vmovdqu %xmm0, (%rcx) @@ -1326,14 +1325,14 @@ ; AVX2-FAST-ALL-NEXT: vmovdqu (%rsi), %xmm0 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] +; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,22,23,u,u,24,25,28,29,u,u,26,27] ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] @@ -1371,17 +1370,18 @@ ; XOP-NEXT: vmovdqu (%rsi), %xmm0 ; XOP-NEXT: vmovdqu (%rdx), %xmm1 ; XOP-NEXT: vmovdqu (%rcx), %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] -; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11] -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15] -; XOP-NEXT: vmovdqu %xmm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, (%rdi) -; XOP-NEXT: vzeroupper +; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[10,11,u,u],xmm0[12,13],xmm1[12,13,u,u],xmm0[14,15],xmm1[14,15,u,u] +; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm0[0,1],xmm1[0,1],xmm0[u,u,2,3],xmm1[2,3],xmm0[u,u,4,5],xmm1[4,5] +; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,6,7],xmm1[6,7],xmm0[u,u,8,9],xmm1[8,9],xmm0[u,u,10,11] +; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; XOP-NEXT: vmovdqu %xmm0, 16(%rdi) +; XOP-NEXT: vmovdqu %xmm4, (%rdi) +; XOP-NEXT: vmovdqu %xmm3, 32(%rdi) ; XOP-NEXT: retq %s1 = load <8 x i16>, ptr %q1, align 4 %s2 = load <8 x i16>, ptr %q2, align 4 @@ -1418,10 +1418,10 @@ ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0] ; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm2[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] ; SSE2-NEXT: movups %xmm10, 16(%rsi) @@ -1474,27 +1474,27 @@ ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; AVX1-NEXT: vmovups (%rdi), %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-NEXT: vmovups 16(%rdi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; AVX1-NEXT: vmovups 16(%rdi), %xmm6 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-NEXT: vmovups 16(%rdi), %xmm6 ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-NEXT: vmovups %ymm3, (%rsi) -; AVX1-NEXT: vmovups %ymm5, (%rdx) +; AVX1-NEXT: vmovups %ymm4, (%rdx) ; AVX1-NEXT: vmovups %ymm0, (%rcx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1596,27 +1596,27 @@ ; XOP-NEXT: vmovups 32(%rdi), %ymm1 ; XOP-NEXT: vmovups (%rdi), %ymm2 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; XOP-NEXT: vmovups 16(%rdi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; XOP-NEXT: vmovups 16(%rdi), %xmm6 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; XOP-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; XOP-NEXT: vmovups 16(%rdi), %xmm6 ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; XOP-NEXT: vmovups %ymm3, (%rsi) -; XOP-NEXT: vmovups %ymm5, (%rdx) +; XOP-NEXT: vmovups %ymm4, (%rdx) ; XOP-NEXT: vmovups %ymm0, (%rcx) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq @@ -1633,79 +1633,79 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; SSE2-LABEL: interleave_24i32_in: ; SSE2: # %bb.0: -; SSE2-NEXT: movups (%rsi), %xmm1 -; SSE2-NEXT: movups 16(%rsi), %xmm0 -; SSE2-NEXT: movups (%rdx), %xmm2 -; SSE2-NEXT: movups 16(%rdx), %xmm5 -; SSE2-NEXT: movups (%rcx), %xmm4 -; SSE2-NEXT: movups 16(%rcx), %xmm6 -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[1,3] -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: movups (%rsi), %xmm0 +; SSE2-NEXT: movups 16(%rsi), %xmm1 +; SSE2-NEXT: movups (%rdx), %xmm5 +; SSE2-NEXT: movups 16(%rdx), %xmm2 +; SSE2-NEXT: movups (%rcx), %xmm6 +; SSE2-NEXT: movups 16(%rcx), %xmm4 +; SSE2-NEXT: movaps %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm0[1,0] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm0, %xmm7 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE2-NEXT: movaps %xmm6, %xmm8 -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE2-NEXT: movaps %xmm0, %xmm9 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm5[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] -; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[0,2] +; SSE2-NEXT: movaps %xmm4, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm1[1,0] +; SSE2-NEXT: movaps %xmm1, %xmm7 +; SSE2-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm5 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE2-NEXT: movups %xmm4, 16(%rdi) -; SSE2-NEXT: movups %xmm9, 48(%rdi) -; SSE2-NEXT: movups %xmm6, 64(%rdi) +; SSE2-NEXT: movups %xmm4, 64(%rdi) +; SSE2-NEXT: movups %xmm7, 48(%rdi) +; SSE2-NEXT: movups %xmm6, 16(%rdi) ; SSE2-NEXT: movups %xmm3, (%rdi) -; SSE2-NEXT: movups %xmm1, 32(%rdi) -; SSE2-NEXT: movups %xmm0, 80(%rdi) +; SSE2-NEXT: movups %xmm1, 80(%rdi) +; SSE2-NEXT: movups %xmm0, 32(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_in: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rsi), %xmm0 -; SSE42-NEXT: movdqu 16(%rsi), %xmm2 +; SSE42-NEXT: movdqu (%rsi), %xmm2 +; SSE42-NEXT: movdqu 16(%rsi), %xmm0 ; SSE42-NEXT: movdqu (%rdx), %xmm3 ; SSE42-NEXT: movdqu 16(%rdx), %xmm4 ; SSE42-NEXT: movdqu (%rcx), %xmm5 ; SSE42-NEXT: movdqu 16(%rcx), %xmm6 -; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: movdqa %xmm2, %xmm1 ; SSE42-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: movdqa %xmm2, %xmm8 +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7] +; SSE42-NEXT: movdqa %xmm0, %xmm8 ; SSE42-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,2,2] +; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] ; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,3],xmm8[4,5,6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3,4,5],xmm4[6,7] -; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3],xmm8[4,5,6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] +; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: movdqu %xmm2, 32(%rdi) -; SSE42-NEXT: movdqu %xmm4, 80(%rdi) -; SSE42-NEXT: movdqu %xmm8, 16(%rdi) +; SSE42-NEXT: movdqu %xmm2, 80(%rdi) +; SSE42-NEXT: movdqu %xmm3, 32(%rdi) +; SSE42-NEXT: movdqu %xmm8, 64(%rdi) ; SSE42-NEXT: movdqu %xmm9, 48(%rdi) -; SSE42-NEXT: movdqu %xmm7, 64(%rdi) +; SSE42-NEXT: movdqu %xmm7, 16(%rdi) ; SSE42-NEXT: movdqu %xmm1, (%rdi) ; SSE42-NEXT: retq ; @@ -1716,28 +1716,28 @@ ; AVX1-NEXT: vmovups 16(%rdx), %xmm2 ; AVX1-NEXT: vmovups (%rsi), %xmm3 ; AVX1-NEXT: vmovups 16(%rsi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm1[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1],xmm5[0,2] ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-NEXT: vbroadcastsd (%rcx), %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm2[3,3] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vmovups %ymm2, 64(%rdi) +; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1746,26 +1746,26 @@ ; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdi) -; AVX2-SLOW-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1774,28 +1774,29 @@ ; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %xmm3 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3 +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-ALL-NEXT: vmovups %ymm1, 64(%rdi) ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdi) -; AVX2-FAST-ALL-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rdi) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; @@ -1804,26 +1805,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, (%rdi) -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, 64(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rdi) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1837,25 +1838,25 @@ ; XOP-NEXT: vmovups 16(%rdx), %xmm3 ; XOP-NEXT: vmovups (%rsi), %xmm4 ; XOP-NEXT: vmovups 16(%rsi), %xmm5 -; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm3[3,3] -; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] -; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3] -; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3] -; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] +; XOP-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm2[1] +; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm2[1,1],xmm6[0,2] ; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; XOP-NEXT: vbroadcastsd (%rcx), %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3],xmm3[3,3] +; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] +; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vmovups %ymm3, 64(%rdi) +; XOP-NEXT: vmovups %ymm2, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, ptr %q1, align 4 @@ -2229,12 +2230,14 @@ ; ; SSE42-LABEL: splat_v3i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE42-NEXT: movq (%rdi), %rax +; SSE42-NEXT: movq %rax, %xmm0 +; SSE42-NEXT: shrq $32, %rax ; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,0,1] +; SSE42-NEXT: movd %eax, %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; SSE42-NEXT: xorps %xmm3, %xmm3 ; SSE42-NEXT: retq ; @@ -2390,8 +2393,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; SSE-NEXT: movdqa %xmm0, 128(%rsi) ; SSE-NEXT: movdqa %xmm2, 144(%rsi) +; SSE-NEXT: movdqa %xmm0, 128(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm7, 240(%rsi) ; SSE-NEXT: movdqa %xmm6, 208(%rsi) @@ -2403,19 +2406,19 @@ ; ; AVX1-LABEL: D107009: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups 96(%rdi), %ymm0 -; AVX1-NEXT: vmovups (%rdi), %ymm1 +; AVX1-NEXT: vmovups (%rdi), %ymm0 +; AVX1-NEXT: vmovups 64(%rdi), %ymm1 ; AVX1-NEXT: vmovups 128(%rdi), %ymm2 -; AVX1-NEXT: vmovups 224(%rdi), %ymm3 +; AVX1-NEXT: vmovups 192(%rdi), %ymm3 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] -; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,2],ymm2[4,5],ymm3[4,6] +; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 @@ -2424,16 +2427,16 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) ; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi) ; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi) ; AVX1-NEXT: vmovups %ymm1, 128(%rsi) -; AVX1-NEXT: vmovupd %ymm5, 192(%rsi) -; AVX1-NEXT: vmovups %ymm4, 224(%rsi) +; AVX1-NEXT: vmovups %ymm5, 224(%rsi) +; AVX1-NEXT: vmovupd %ymm4, 192(%rsi) ; AVX1-NEXT: vmovups %ymm3, 160(%rsi) ; AVX1-NEXT: vmovups %ymm2, 64(%rsi) ; AVX1-NEXT: vzeroupper @@ -2455,39 +2458,39 @@ ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[3,3,3,3,7,7,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmovdqu %ymm0, 128(%rsi) -; AVX2-NEXT: vmovdqu %ymm7, 192(%rsi) -; AVX2-NEXT: vmovdqu %ymm6, 224(%rsi) +; AVX2-NEXT: vmovdqu %ymm7, 224(%rsi) +; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi) ; AVX2-NEXT: vmovdqu %ymm5, 160(%rsi) -; AVX2-NEXT: vmovdqu %ymm4, 64(%rsi) -; AVX2-NEXT: vmovdqa %xmm3, 112(%rsi) -; AVX2-NEXT: vmovdqu %ymm2, (%rsi) -; AVX2-NEXT: vmovdqa %xmm1, 48(%rsi) +; AVX2-NEXT: vmovdqa %xmm4, 112(%rsi) +; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi) +; AVX2-NEXT: vmovdqa %xmm2, 48(%rsi) +; AVX2-NEXT: vmovdqu %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; XOP-LABEL: D107009: ; XOP: # %bb.0: -; XOP-NEXT: vmovups 96(%rdi), %ymm0 -; XOP-NEXT: vmovups (%rdi), %ymm1 +; XOP-NEXT: vmovups (%rdi), %ymm0 +; XOP-NEXT: vmovups 64(%rdi), %ymm1 ; XOP-NEXT: vmovups 128(%rdi), %ymm2 -; XOP-NEXT: vmovups 224(%rdi), %ymm3 +; XOP-NEXT: vmovups 192(%rdi), %ymm3 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] ; XOP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[4],mem[4],ymm2[5],mem[5] -; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,0],ymm2[4,5],ymm3[6,4] -; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 -; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,2],ymm2[4,5],ymm3[4,6] +; XOP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 -; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; XOP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; XOP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 ; XOP-NEXT: vextractf128 $1, %ymm2, %xmm1 ; XOP-NEXT: vpsrld $16, %xmm1, %xmm1 @@ -2496,16 +2499,16 @@ ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] -; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm1[0,0,3,2] +; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[0,0,3,2] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] ; XOP-NEXT: vmovdqa %xmm0, 16(%rsi) ; XOP-NEXT: vmovdqa %xmm7, 112(%rsi) ; XOP-NEXT: vmovdqa %xmm6, 48(%rsi) ; XOP-NEXT: vmovups %ymm1, 128(%rsi) -; XOP-NEXT: vmovupd %ymm5, 192(%rsi) -; XOP-NEXT: vmovups %ymm4, 224(%rsi) +; XOP-NEXT: vmovups %ymm5, 224(%rsi) +; XOP-NEXT: vmovupd %ymm4, 192(%rsi) ; XOP-NEXT: vmovups %ymm3, 160(%rsi) ; XOP-NEXT: vmovups %ymm2, 64(%rsi) ; XOP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -77,28 +77,51 @@ ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: PR40815: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX-NEXT: vmovaps %xmm3, (%rsi) -; AVX-NEXT: vmovaps %xmm0, 48(%rsi) -; AVX-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: PR40815: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX1-NEXT: vmovups %ymm2, (%rsi) +; AVX1-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; AVX1-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR40815: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX2-NEXT: vmovups %ymm2, (%rsi) +; AVX2-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; AVX2-NEXT: vmovaps %xmm0, 48(%rsi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: PR40815: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX512-NEXT: vmovups 16(%rdi), %ymm1 -; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = mem[2,3,0,1] +; AVX512-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX512-NEXT: vmovups %ymm2, (%rsi) +; AVX512-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX512-NEXT: vmovaps %xmm0, 48(%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq +; +; XOP-LABEL: PR40815: +; XOP: # %bb.0: +; XOP-NEXT: vmovaps (%rdi), %xmm0 +; XOP-NEXT: vmovaps (%rdi), %ymm1 +; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,0,1] +; XOP-NEXT: vmovups %ymm2, (%rsi) +; XOP-NEXT: vextractf128 $1, %ymm1, 32(%rsi) +; XOP-NEXT: vmovaps %xmm0, 48(%rsi) +; XOP-NEXT: vzeroupper +; XOP-NEXT: retq %3 = load <16 x float>, ptr %0, align 64 %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> %5 = getelementptr inbounds %struct.Mat4, ptr %1, i64 0, i32 0, i32 0, i64 4 @@ -157,106 +180,125 @@ ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: ; SSE2-NEXT: movl b(%rip), %eax +; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movdqa c+144(%rip), %xmm1 ; SSE2-NEXT: addl c+128(%rip), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa d+144(%rip), %xmm4 -; SSE2-NEXT: psubd %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm1, %xmm1 +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movaps {{.*#+}} xmm3 = +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] -; SSE2-NEXT: movdqa %xmm1, c+144(%rip) +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE2-NEXT: movdqa d+144(%rip), %xmm3 +; SSE2-NEXT: psubd %xmm2, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) -; SSE2-NEXT: movdqa c+160(%rip), %xmm1 -; SSE2-NEXT: movdqa c+176(%rip), %xmm3 -; SSE2-NEXT: movdqa d+160(%rip), %xmm5 -; SSE2-NEXT: movdqa d+176(%rip), %xmm6 +; SSE2-NEXT: movdqa c+176(%rip), %xmm2 +; SSE2-NEXT: movdqa c+160(%rip), %xmm4 +; SSE2-NEXT: movdqa d+176(%rip), %xmm5 +; SSE2-NEXT: movdqa d+160(%rip), %xmm6 ; SSE2-NEXT: movdqa d+128(%rip), %xmm7 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm3, %xmm6 -; SSE2-NEXT: psubd %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm5, d+160(%rip) -; SSE2-NEXT: movdqa %xmm6, d+176(%rip) -; SSE2-NEXT: movdqa %xmm4, d+144(%rip) +; SSE2-NEXT: psubd %xmm4, %xmm6 +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm5, d+176(%rip) +; SSE2-NEXT: movdqa %xmm6, d+160(%rip) +; SSE2-NEXT: movdqa %xmm3, d+144(%rip) ; SSE2-NEXT: movdqa %xmm7, d+128(%rip) -; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, c+160(%rip) -; SSE2-NEXT: movdqa %xmm3, c+176(%rip) +; SSE2-NEXT: paddd %xmm4, %xmm4 +; SSE2-NEXT: paddd %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm2, c+176(%rip) +; SSE2-NEXT: movdqa %xmm4, c+160(%rip) ; SSE2-NEXT: retq ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: ; SSE42-NEXT: movl b(%rip), %eax -; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 +; SSE42-NEXT: movdqa c+128(%rip), %xmm0 ; SSE42-NEXT: addl c+128(%rip), %eax -; SSE42-NEXT: movd %eax, %xmm2 -; SSE42-NEXT: paddd %xmm0, %xmm2 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = +; SSE42-NEXT: pinsrd $0, %eax, %xmm2 +; SSE42-NEXT: movdqa %xmm0, %xmm3 +; SSE42-NEXT: paddd %xmm2, %xmm3 +; SSE42-NEXT: pslld $23, %xmm2 +; SSE42-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE42-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE42-NEXT: pmulld %xmm0, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm1, %xmm3 ; SSE42-NEXT: paddd %xmm1, %xmm1 -; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: paddd %xmm0, %xmm4 -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, c+144(%rip) -; SSE42-NEXT: movdqa %xmm4, c+128(%rip) -; SSE42-NEXT: movdqa c+160(%rip), %xmm1 -; SSE42-NEXT: movdqa c+176(%rip), %xmm2 -; SSE42-NEXT: movdqa d+160(%rip), %xmm4 -; SSE42-NEXT: movdqa d+176(%rip), %xmm5 +; SSE42-NEXT: movdqa %xmm2, c+128(%rip) +; SSE42-NEXT: movdqa c+176(%rip), %xmm1 +; SSE42-NEXT: movdqa c+160(%rip), %xmm2 +; SSE42-NEXT: movdqa d+176(%rip), %xmm4 +; SSE42-NEXT: movdqa d+160(%rip), %xmm5 ; SSE42-NEXT: movdqa d+128(%rip), %xmm6 ; SSE42-NEXT: pinsrd $0, %eax, %xmm0 ; SSE42-NEXT: psubd %xmm0, %xmm6 ; SSE42-NEXT: psubd %xmm2, %xmm5 ; SSE42-NEXT: psubd %xmm1, %xmm4 -; SSE42-NEXT: movdqa %xmm4, d+160(%rip) -; SSE42-NEXT: movdqa %xmm5, d+176(%rip) +; SSE42-NEXT: movdqa %xmm4, d+176(%rip) +; SSE42-NEXT: movdqa %xmm5, d+160(%rip) ; SSE42-NEXT: movdqa %xmm3, d+144(%rip) ; SSE42-NEXT: movdqa %xmm6, d+128(%rip) ; SSE42-NEXT: paddd %xmm2, %xmm2 ; SSE42-NEXT: paddd %xmm1, %xmm1 -; SSE42-NEXT: movdqa %xmm1, c+160(%rip) -; SSE42-NEXT: movdqa %xmm2, c+176(%rip) +; SSE42-NEXT: movdqa %xmm1, c+176(%rip) +; SSE42-NEXT: movdqa %xmm2, c+160(%rip) ; SSE42-NEXT: retq ; ; AVX1-LABEL: PR42833: ; AVX1: # %bb.0: ; AVX1-NEXT: movl b(%rip), %eax ; AVX1-NEXT: addl c+128(%rip), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa c+128(%rip), %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vmovdqa c+144(%rip), %xmm3 +; AVX1-NEXT: vpslld $23, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] ; AVX1-NEXT: vmovdqa d+144(%rip), %xmm2 ; AVX1-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vmovups %ymm0, c+128(%rip) ; AVX1-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqa d+128(%rip), %xmm1 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa d+176(%rip), %xmm1 -; AVX1-NEXT: vmovdqa c+176(%rip), %xmm3 +; AVX1-NEXT: vmovdqa d+160(%rip), %xmm1 +; AVX1-NEXT: vmovdqa c+160(%rip), %xmm3 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa d+160(%rip), %xmm4 -; AVX1-NEXT: vmovdqa c+160(%rip), %xmm5 +; AVX1-NEXT: vmovdqa d+176(%rip), %xmm4 +; AVX1-NEXT: vmovdqa c+176(%rip), %xmm5 ; AVX1-NEXT: vpsubd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa %xmm4, d+176(%rip) +; AVX1-NEXT: vmovdqa %xmm1, d+160(%rip) ; AVX1-NEXT: vmovdqa %xmm2, d+144(%rip) -; AVX1-NEXT: vmovdqa %xmm4, d+160(%rip) -; AVX1-NEXT: vmovdqa %xmm1, d+176(%rip) ; AVX1-NEXT: vmovdqa %xmm0, d+128(%rip) ; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm0 ; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, c+160(%rip) -; AVX1-NEXT: vmovdqa %xmm0, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm1, c+176(%rip) +; AVX1-NEXT: vmovdqa %xmm0, c+160(%rip) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -288,12 +330,12 @@ ; AVX512-NEXT: movl b(%rip), %eax ; AVX512-NEXT: vmovdqu c+128(%rip), %ymm0 ; AVX512-NEXT: vmovdqu64 c+128(%rip), %zmm1 +; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2 ; AVX512-NEXT: addl c+128(%rip), %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovd %eax, %xmm3 +; AVX512-NEXT: vpaddd %ymm3, %ymm0, %ymm3 ; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa c+128(%rip), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7] ; AVX512-NEXT: vmovdqu %ymm0, c+128(%rip) ; AVX512-NEXT: vmovdqu c+160(%rip), %ymm0 ; AVX512-NEXT: vmovdqu64 d+128(%rip), %zmm3 @@ -311,34 +353,35 @@ ; XOP: # %bb.0: ; XOP-NEXT: movl b(%rip), %eax ; XOP-NEXT: addl c+128(%rip), %eax -; XOP-NEXT: vmovd %eax, %xmm0 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0 ; XOP-NEXT: vmovdqa c+128(%rip), %xmm1 -; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; XOP-NEXT: vpaddd %xmm0, %xmm1, %xmm2 ; XOP-NEXT: vmovdqa c+144(%rip), %xmm3 +; XOP-NEXT: vpshld %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7] +; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7] ; XOP-NEXT: vmovdqa d+144(%rip), %xmm2 ; XOP-NEXT: vpsubd c+144(%rip), %xmm2, %xmm2 ; XOP-NEXT: vmovups %ymm0, c+128(%rip) ; XOP-NEXT: vpinsrd $0, %eax, %xmm1, %xmm0 ; XOP-NEXT: vmovdqa d+128(%rip), %xmm1 ; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; XOP-NEXT: vmovdqa d+176(%rip), %xmm1 -; XOP-NEXT: vmovdqa c+176(%rip), %xmm3 +; XOP-NEXT: vmovdqa d+160(%rip), %xmm1 +; XOP-NEXT: vmovdqa c+160(%rip), %xmm3 ; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa d+160(%rip), %xmm4 -; XOP-NEXT: vmovdqa c+160(%rip), %xmm5 +; XOP-NEXT: vmovdqa d+176(%rip), %xmm4 +; XOP-NEXT: vmovdqa c+176(%rip), %xmm5 ; XOP-NEXT: vpsubd %xmm5, %xmm4, %xmm4 +; XOP-NEXT: vmovdqa %xmm4, d+176(%rip) +; XOP-NEXT: vmovdqa %xmm1, d+160(%rip) ; XOP-NEXT: vmovdqa %xmm2, d+144(%rip) -; XOP-NEXT: vmovdqa %xmm4, d+160(%rip) -; XOP-NEXT: vmovdqa %xmm1, d+176(%rip) ; XOP-NEXT: vmovdqa %xmm0, d+128(%rip) ; XOP-NEXT: vpaddd %xmm3, %xmm3, %xmm0 ; XOP-NEXT: vpaddd %xmm5, %xmm5, %xmm1 -; XOP-NEXT: vmovdqa %xmm1, c+160(%rip) -; XOP-NEXT: vmovdqa %xmm0, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm1, c+176(%rip) +; XOP-NEXT: vmovdqa %xmm0, c+160(%rip) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %1 = load i32, ptr @b, align 4 diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -16,8 +16,14 @@ define i1 @p0_scalar_urem_by_const(i32 %x, i32 %y) { ; CHECK-LABEL: p0_scalar_urem_by_const: ; CHECK: # %bb.0: -; CHECK-NEXT: testb %dil, %dil -; CHECK-NEXT: setns %al +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: andl $128, %edi +; CHECK-NEXT: imulq $715827883, %rdi, %rax # imm = 0x2AAAAAAB +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: addl %eax, %eax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 128 ; clearly a power-of-two or zero %t1 = urem i32 %t0, 6 ; '6' is clearly not a power of two @@ -42,12 +48,16 @@ ; CHECK-LABEL: p2_scalar_shifted_urem_by_const: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: shrq $33, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 1 ; clearly a power-of-two or zero %t1 = shl i32 %t0, %y ; will still be a power-of-two or zero with any %y @@ -60,12 +70,16 @@ ; CHECK-LABEL: p3_scalar_shifted2_urem_by_const: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %ecx +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $2, %edi ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: movl $2863311531, %eax # imm = 0xAAAAAAAB +; CHECK-NEXT: imulq %rdi, %rax +; CHECK-NEXT: shrq $33, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 2 ; clearly a power-of-two or zero %t1 = shl i32 %t0, %y ; will still be a power-of-two or zero with any %y @@ -83,29 +97,33 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p4_vector_urem_by_const__splat: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: psrld $1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; SSE4-NEXT: pmuludq %xmm2, %xmm1 +; SSE4-NEXT: pmuludq %xmm0, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE4-NEXT: psrld $2, %xmm2 +; SSE4-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; @@ -113,11 +131,16 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero @@ -130,43 +153,55 @@ ; SSE2-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2147483648,u> +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE4-NEXT: psrlq $32, %xmm1 -; SSE4-NEXT: por %xmm1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,858993459,715827882,477218588] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE4-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; SSE4-NEXT: movdqa %xmm2, %xmm1 +; SSE4-NEXT: psrld $2, %xmm1 +; SSE4-NEXT: psrld $1, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; SSE4-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: p5_vector_urem_by_const__nonsplat: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -180,32 +215,39 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: psubd %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrld $1, %xmm1 -; SSE4-NEXT: pslld $31, %xmm0 -; SSE4-NEXT: por %xmm1, %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; SSE4-NEXT: pmuludq %xmm2, %xmm1 +; SSE4-NEXT: pmuludq %xmm0, %xmm2 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE4-NEXT: psrld $2, %xmm2 +; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE4-NEXT: psubd %xmm2, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; @@ -213,11 +255,17 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -354,10 +402,13 @@ define i1 @n0_urem_of_maybe_not_power_of_two(i32 %x, i32 %y) { ; CHECK-LABEL: n0_urem_of_maybe_not_power_of_two: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: andl $3, %edi -; CHECK-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; CHECK-NEXT: cmpl $1431655766, %eax # imm = 0x55555556 -; CHECK-NEXT: setb %al +; CHECK-NEXT: imulq $1431655766, %rdi, %rax # imm = 0x55555556 +; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: leal (%rax,%rax,2), %eax +; CHECK-NEXT: cmpl %eax, %edi +; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %t0 = and i32 %x, 3 ; up to two bits set, not power-of-two %t1 = urem i32 %t0, 3 diff --git a/llvm/test/CodeGen/X86/overflow.ll b/llvm/test/CodeGen/X86/overflow.ll --- a/llvm/test/CodeGen/X86/overflow.ll +++ b/llvm/test/CodeGen/X86/overflow.ll @@ -56,9 +56,11 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: andl $1, %ecx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: setb %sil ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %1 = zext i64 %a to i128 %2 = zext i64 %b to i128 diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll --- a/llvm/test/CodeGen/X86/parity-vec.ll +++ b/llvm/test/CodeGen/X86/parity-vec.ll @@ -36,8 +36,8 @@ ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: testb $1, %al +; POPCNT-NEXT: setne %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) @@ -50,23 +50,8 @@ ; NOPOPCNT: # %bb.0: ; NOPOPCNT-NEXT: psllw $7, %xmm0 ; NOPOPCNT-NEXT: pmovmskb %xmm0, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: shrl %ecx -; NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; NOPOPCNT-NEXT: subl %ecx, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; NOPOPCNT-NEXT: shrl $2, %eax -; NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; NOPOPCNT-NEXT: addl %ecx, %eax -; NOPOPCNT-NEXT: movl %eax, %ecx -; NOPOPCNT-NEXT: shrl $4, %ecx -; NOPOPCNT-NEXT: addl %eax, %ecx -; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; NOPOPCNT-NEXT: movl %ecx, %eax -; NOPOPCNT-NEXT: shrl $8, %eax -; NOPOPCNT-NEXT: addl %ecx, %eax -; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax +; NOPOPCNT-NEXT: xorb %ah, %al +; NOPOPCNT-NEXT: setnp %al ; NOPOPCNT-NEXT: retq ; ; POPCNT-LABEL: canonical_parity_noncanonical_pred: @@ -74,7 +59,9 @@ ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: andl $1, %eax +; POPCNT-NEXT: cmpw $1, %ax +; POPCNT-NEXT: sete %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) @@ -143,8 +130,8 @@ ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax ; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: xorb $1, %al -; POPCNT-NEXT: # kill: def $al killed $al killed $eax +; POPCNT-NEXT: cmpw $1, %ax +; POPCNT-NEXT: setne %al ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 %i2 = call i16 @llvm.ctpop.i16(i16 %i1) diff --git a/llvm/test/CodeGen/X86/phaddsub-extract.ll b/llvm/test/CodeGen/X86/phaddsub-extract.ll --- a/llvm/test/CodeGen/X86/phaddsub-extract.ll +++ b/llvm/test/CodeGen/X86/phaddsub-extract.ll @@ -1687,23 +1687,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_add_v8i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_add_v8i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: partial_reduction_add_v8i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_add_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_add_v8i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_add_v8i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %x0213 = add <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> @@ -1730,23 +1771,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_add_v16i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_add_v16i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: partial_reduction_add_v16i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_add_v16i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_add_v16i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_add_v16i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> %x0213 = add <16 x i32> %x, %x23 %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> @@ -1773,24 +1855,64 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: partial_reduction_sub_v8i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: partial_reduction_sub_v8i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %x0213 = sub <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> @@ -1817,15 +1939,15 @@ ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX1-FAST: # %bb.0: @@ -1836,19 +1958,40 @@ ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; +; AVX2-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; ; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovd %xmm0, %eax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: partial_reduction_sub_v16i32: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vpsubd %zmm1, %zmm0, %zmm0 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512-FAST-NEXT: vmovd %xmm0, %eax @@ -1937,8 +2080,10 @@ ; ; SSE3-FAST-LABEL: hadd16_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 -; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-FAST-NEXT: paddw %xmm0, %xmm1 +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-FAST-NEXT: paddw %xmm1, %xmm0 ; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 ; SSE3-FAST-NEXT: movd %xmm0, %eax ; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1958,8 +2103,10 @@ ; ; AVX-FAST-LABEL: hadd16_8: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1986,9 +2133,10 @@ ; ; SSE3-FAST-LABEL: hadd32_4: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSE3-FAST-NEXT: movd %xmm0, %eax +; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 +; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 +; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: hadd32_4: @@ -2002,7 +2150,8 @@ ; ; AVX-FAST-LABEL: hadd32_4: ; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vmovd %xmm0, %eax ; AVX-FAST-NEXT: retq @@ -2032,23 +2181,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_8: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: hadd32_8: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: hadd32_8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_8: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_8: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> %x227 = add <8 x i32> %x225, %x226 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> @@ -2075,23 +2265,64 @@ ; SSE3-FAST-NEXT: movd %xmm1, %eax ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_16: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovd %xmm0, %eax -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_16: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: hadd32_16: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vmovd %xmm0, %eax -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: hadd32_16: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_16: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_16: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> %x227 = add <16 x i32> %x225, %x226 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> @@ -2103,8 +2334,10 @@ define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { ; SSE3-LABEL: hadd16_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddw %xmm0, %xmm0 -; SSE3-NEXT: phaddw %xmm0, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: paddw %xmm1, %xmm0 ; SSE3-NEXT: phaddw %xmm0, %xmm0 ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax @@ -2112,8 +2345,10 @@ ; ; AVX-LABEL: hadd16_8_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax @@ -2131,14 +2366,16 @@ define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize { ; SSE3-LABEL: hadd32_4_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: phaddd %xmm1, %xmm1 +; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; ; AVX-LABEL: hadd32_4_optsize: ; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -2153,14 +2390,16 @@ define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { ; SSE3-LABEL: hadd32_4_pgso: ; SSE3: # %bb.0: -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: phaddd %xmm0, %xmm0 -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: phaddd %xmm1, %xmm1 +; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; ; AVX-LABEL: hadd32_4_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq @@ -2181,13 +2420,63 @@ ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; -; AVX-LABEL: hadd32_8_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_8_optsize: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: hadd32_8_optsize: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_8_optsize: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_8_optsize: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_8_optsize: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_8_optsize: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> %x227 = add <8 x i32> %x225, %x226 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> @@ -2205,13 +2494,63 @@ ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: retq ; -; AVX-LABEL: hadd32_16_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: hadd32_16_optsize: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: hadd32_16_optsize: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: hadd32_16_optsize: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: hadd32_16_optsize: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: hadd32_16_optsize: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-SLOW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; AVX512-SLOW-NEXT: vzeroupper +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: hadd32_16_optsize: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-FAST-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vmovd %xmm0, %eax +; AVX512-FAST-NEXT: vzeroupper +; AVX512-FAST-NEXT: retq %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> %x227 = add <16 x i32> %x225, %x226 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -340,38 +340,17 @@ } define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { -; SSSE3-SLOW-LABEL: phaddd_single_source2: -; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: retq -; -; SSSE3-FAST-LABEL: phaddd_single_source2: -; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] -; SSSE3-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: phaddd_single_source2: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: phaddd_single_source2: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-FAST-NEXT: retq +; SSSE3-LABEL: phaddd_single_source2: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-NEXT: retq ; -; AVX2-SHUF-LABEL: phaddd_single_source2: -; AVX2-SHUF: # %bb.0: -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SHUF-NEXT: retq +; AVX-LABEL: phaddd_single_source2: +; AVX: # %bb.0: +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -503,38 +482,17 @@ } define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { -; SSSE3-SLOW-LABEL: phaddw_single_source2: -; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: retq -; -; SSSE3-FAST-LABEL: phaddw_single_source2: -; SSSE3-FAST: # %bb.0: -; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; SSSE3-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: phaddw_single_source2: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: phaddw_single_source2: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; AVX-FAST-NEXT: retq +; SSSE3-LABEL: phaddw_single_source2: +; SSSE3: # %bb.0: +; SSSE3-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSSE3-NEXT: retq ; -; AVX2-SHUF-LABEL: phaddw_single_source2: -; AVX2-SHUF: # %bb.0: -; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-SHUF-NEXT: retq +; AVX-LABEL: phaddw_single_source2: +; AVX: # %bb.0: +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -644,9 +602,11 @@ ; AVX1-FAST-LABEL: PR39936_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -664,8 +624,8 @@ ; ; AVX2-FAST-LABEL: PR39936_v8i32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -10,15 +10,67 @@ define <8 x i16> @pmaddubsw_128(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_128: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_128: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_128: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_128: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -43,26 +95,189 @@ define <16 x i16> @pmaddubsw_256(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_256: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 -; SSE-NEXT: pmaddubsw 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psllw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm5 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pmaddwd %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: pmaddwd %xmm4, %xmm1 +; SSE-NEXT: packssdw %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_256: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-NEXT: vmovdqa (%rsi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm3, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,3,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-NEXT: vpmovsxbw %xmm12, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm5, %xmm5 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpmovsxbw %xmm11, %xmm5 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddubsw_256: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa (%rsi), %ymm0 -; AVX256-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddubsw_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: pmaddubsw_256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm4 +; AVX512F-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-NEXT: vpand %xmm2, %xmm5, %xmm6 +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: pmaddubsw_256: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm3 +; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: retq %A = load <32 x i8>, ptr %Aptr %B = load <32 x i8>, ptr %Bptr %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> @@ -91,63 +306,548 @@ ; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: movdqa 32(%rdx), %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 -; SSE-NEXT: pmaddubsw (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw 16(%rsi), %xmm1 -; SSE-NEXT: pmaddubsw 32(%rsi), %xmm2 -; SSE-NEXT: pmaddubsw 48(%rsi), %xmm3 -; SSE-NEXT: movdqa 64(%rdx), %xmm4 -; SSE-NEXT: pmaddubsw 64(%rsi), %xmm4 -; SSE-NEXT: movdqa 80(%rdx), %xmm5 -; SSE-NEXT: pmaddubsw 80(%rsi), %xmm5 -; SSE-NEXT: movdqa 96(%rdx), %xmm6 -; SSE-NEXT: pmaddubsw 96(%rsi), %xmm6 -; SSE-NEXT: movdqa 112(%rdx), %xmm7 -; SSE-NEXT: pmaddubsw 112(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, 112(%rdi) -; SSE-NEXT: movdqa %xmm6, 96(%rdi) -; SSE-NEXT: movdqa %xmm5, 80(%rdi) -; SSE-NEXT: movdqa %xmm4, 64(%rdi) -; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm2, 32(%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa 48(%rdx), %xmm4 +; SSE-NEXT: movdqa 64(%rdx), %xmm5 +; SSE-NEXT: movdqa 80(%rdx), %xmm8 +; SSE-NEXT: movdqa 96(%rdx), %xmm10 +; SSE-NEXT: movdqa 112(%rdx), %xmm9 +; SSE-NEXT: movdqa 112(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: psllw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm3 +; SSE-NEXT: psraw $8, %xmm7 +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE-NEXT: pmaddwd %xmm11, %xmm12 +; SSE-NEXT: movdqa 96(%rsi), %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: psllw $8, %xmm7 +; SSE-NEXT: psraw $8, %xmm7 +; SSE-NEXT: psraw $8, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm9, %xmm3 +; SSE-NEXT: packssdw %xmm12, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: pmaddwd %xmm9, %xmm12 +; SSE-NEXT: movdqa 80(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: psllw $8, %xmm9 +; SSE-NEXT: psraw $8, %xmm9 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm10, %xmm7 +; SSE-NEXT: packssdw %xmm12, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm10, %xmm11 +; SSE-NEXT: movdqa 64(%rsi), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: psllw $8, %xmm10 +; SSE-NEXT: psraw $8, %xmm10 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm8, %xmm9 +; SSE-NEXT: packssdw %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pmaddwd %xmm8, %xmm11 +; SSE-NEXT: movdqa 48(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: psllw $8, %xmm8 +; SSE-NEXT: psraw $8, %xmm8 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm5, %xmm10 +; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm5, %xmm11 +; SSE-NEXT: movdqa 32(%rsi), %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: psllw $8, %xmm5 +; SSE-NEXT: psraw $8, %xmm5 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm8 +; SSE-NEXT: packssdw %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm11 +; SSE-NEXT: movdqa 16(%rsi), %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: psllw $8, %xmm4 +; SSE-NEXT: psraw $8, %xmm4 +; SSE-NEXT: psraw $8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm2, %xmm5 +; SSE-NEXT: packssdw %xmm11, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pmaddwd %xmm2, %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: psllw $8, %xmm12 +; SSE-NEXT: psraw $8, %xmm12 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pmaddwd %xmm1, %xmm4 +; SSE-NEXT: packssdw %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm1, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm0, %xmm12 +; SSE-NEXT: packssdw %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm3, 112(%rdi) +; SSE-NEXT: movdqa %xmm7, 96(%rdi) +; SSE-NEXT: movdqa %xmm9, 80(%rdi) +; SSE-NEXT: movdqa %xmm10, 64(%rdi) +; SSE-NEXT: movdqa %xmm8, 48(%rdi) +; SSE-NEXT: movdqa %xmm5, 32(%rdi) +; SSE-NEXT: movdqa %xmm4, 16(%rdi) +; SSE-NEXT: movdqa %xmm12, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: pmaddubsw_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX1-NEXT: subq $104, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 112 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX1-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX1-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm8 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm8[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm8 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm8[0],xmm0[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa (%rsi), %xmm12 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm8 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm8[0],xmm1[0] +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm6 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm8 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm8[0],xmm6[0] +; AVX1-NEXT: vmovdqa 80(%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm8 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm15 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm15[0],xmm8[0] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm0, %xmm14, %xmm14 +; AVX1-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm0, %xmm13, %xmm13 +; AVX1-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm0, %xmm10, %xmm14 +; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm9 +; AVX1-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm10 +; AVX1-NEXT: vpand %xmm0, %xmm12, %xmm7 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm12 +; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm9 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm5 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa 112(%rsi), %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm1[0],xmm0[0] +; AVX1-NEXT: vpackuswb %xmm10, %xmm7, %xmm4 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm8, %xmm1, %xmm2 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-NEXT: vpmaddwd %xmm0, %xmm3, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vpackuswb %xmm10, %xmm10, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpackuswb %xmm8, %xmm8, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX1-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm1 -; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovdqa 80(%rsi), %xmm2 -; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX1-NEXT: vpmaddubsw 96(%rdi), %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpackuswb %xmm12, %xmm9, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm11, %xmm4, %xmm7 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm9 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm12, %xmm12, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vpackuswb %xmm11, %xmm11, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,3,3,3] +; AVX1-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm10 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-NEXT: vpmaddwd %xmm7, %xmm9, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpackssdw %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm12, %xmm4, %xmm7 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm10, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm9 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-NEXT: vpmaddwd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-NEXT: vpackuswb %xmm12, %xmm12, %xmm8 +; AVX1-NEXT: vpackuswb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX1-NEXT: vpackssdw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm9 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm8 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm14, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-NEXT: vpackuswb %xmm4, %xmm9, %xmm9 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpmovsxbw %xmm11, %xmm6 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm14, %xmm14, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX1-NEXT: vpmaddwd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm9 +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,1,1] +; AVX1-NEXT: vpackuswb %xmm4, %xmm4, %xmm10 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] +; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[3,3,3,3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[3,3,3,3] +; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7 +; AVX1-NEXT: vpmovsxbw %xmm9, %xmm9 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm8, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackssdw %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: addq $104, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pmaddubsw_512: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX2-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2 -; AVX2-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX2-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm5 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX2-NEXT: vpshufb %ymm11, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm8 +; AVX2-NEXT: vpshufb %ymm11, %ymm4, %ymm12 +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm13 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshufb %ymm12, %ymm6, %ymm1 +; AVX2-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm4 +; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm6 +; AVX2-NEXT: vpshufb %ymm11, %ymm3, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX2-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] +; AVX2-NEXT: vpshufb %ymm12, %ymm7, %ymm7 +; AVX2-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm7 +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm12 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm8, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX2-NEXT: vpmovsxbd %xmm10, %ymm13 +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm10, %ymm14 +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm15 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm7, %ymm10 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm12, %ymm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm8, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm6, %ymm13, %ymm7 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm14, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm5, %ymm15, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm12 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero,xmm13[4],zero,zero,zero,xmm13[5],zero,zero,zero,xmm13[6],zero,zero,zero,xmm13[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm14, %ymm12, %ymm12 +; AVX2-NEXT: vpaddd %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm12, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm10 +; AVX2-NEXT: vpackssdw %ymm0, %ymm11, %ymm11 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpermq $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-NEXT: # ymm10 = mem[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmovsxbd %xmm9, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm8 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpmovsxbd %xmm10, %ymm3 +; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddd %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: pmaddubsw_512: @@ -166,10 +866,73 @@ ; ; AVX512BW-LABEL: pmaddubsw_512: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vpmaddubsw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaddubsw 64(%rdi), %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,0,2,4,6,8,10,12,14,128,128,128,128,128,128,128,128,16,18,20,22,24,26,28,30,128,128,128,128,128,128,128,128,32,34,36,38,40,42,44,46,128,128,128,128,128,128,128,128,48,50,52,54,56,58,60,62] +; AVX512BW-NEXT: vpshufb %zmm4, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,4,6,8,10,12,14,128,128,128,128,128,128,128,128,16,18,20,22,24,26,28,30,128,128,128,128,128,128,128,128,32,34,36,38,40,42,44,46,128,128,128,128,128,128,128,128,48,50,52,54,56,58,60,62,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,2,4,6,9,11,13,15] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,128,128,128,128,1,3,5,7,9,11,13,15,128,128,128,128,128,128,128,128,17,19,21,23,25,27,29,31,128,128,128,128,128,128,128,128,33,35,37,39,41,43,45,47,128,128,128,128,128,128,128,128,49,51,53,55,57,59,61,63] +; AVX512BW-NEXT: vpshufb %zmm5, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [1,3,5,7,9,11,13,15,128,128,128,128,128,128,128,128,17,19,21,23,25,27,29,31,128,128,128,128,128,128,128,128,33,35,37,39,41,43,45,47,128,128,128,128,128,128,128,128,49,51,53,55,57,59,61,63,128,128,128,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %zmm10, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm1 +; AVX512BW-NEXT: vpshufb %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm6, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512BW-NEXT: vpshufb %zmm5, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb %zmm10, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm7, %ymm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512BW-NEXT: vpmovsxbd %xmm5, %zmm5 +; AVX512BW-NEXT: vpmovsxbd %xmm7, %zmm6 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero,xmm8[8],zero,zero,zero,xmm8[9],zero,zero,zero,xmm8[10],zero,zero,zero,xmm8[11],zero,zero,zero,xmm8[12],zero,zero,zero,xmm8[13],zero,zero,zero,xmm8[14],zero,zero,zero,xmm8[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm8, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero,xmm7[8],zero,zero,zero,xmm7[9],zero,zero,zero,xmm7[10],zero,zero,zero,xmm7[11],zero,zero,zero,xmm7[12],zero,zero,zero,xmm7[13],zero,zero,zero,xmm7[14],zero,zero,zero,xmm7[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm5, %zmm5 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-NEXT: vpmovsxbd %xmm7, %zmm7 +; AVX512BW-NEXT: vpmovsxbd %xmm6, %zmm6 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512BW-NEXT: vpmovsxbd %xmm8, %zmm8 +; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero,xmm10[4],zero,zero,zero,xmm10[5],zero,zero,zero,xmm10[6],zero,zero,zero,xmm10[7],zero,zero,zero,xmm10[8],zero,zero,zero,xmm10[9],zero,zero,zero,xmm10[10],zero,zero,zero,xmm10[11],zero,zero,zero,xmm10[12],zero,zero,zero,xmm10[13],zero,zero,zero,xmm10[14],zero,zero,zero,xmm10[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm10, %zmm7, %zmm7 +; AVX512BW-NEXT: vpaddd %zmm7, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero,xmm9[8],zero,zero,zero,xmm9[9],zero,zero,zero,xmm9[10],zero,zero,zero,xmm9[11],zero,zero,zero,xmm9[12],zero,zero,zero,xmm9[13],zero,zero,zero,xmm9[14],zero,zero,zero,xmm9[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vpaddd %zmm6, %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero,xmm6[8],zero,zero,zero,xmm6[9],zero,zero,zero,xmm6[10],zero,zero,zero,xmm6[11],zero,zero,zero,xmm6[12],zero,zero,zero,xmm6[13],zero,zero,zero,xmm6[14],zero,zero,zero,xmm6[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vpaddd %zmm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vpmaddwd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovsdw %zmm5, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsdw %zmm2, %ymm1 +; AVX512BW-NEXT: vpmovsdw %zmm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: retq %A = load <128 x i8>, ptr %Aptr %B = load <128 x i8>, ptr %Bptr @@ -195,15 +958,78 @@ define <8 x i16> @pmaddubsw_swapped_indices(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_swapped_indices: ; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pshufb {{.*#+}} xmm2 = xmm2[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14] +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15] +; SSE-NEXT: psraw $8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[9],zero,xmm4[8],zero,xmm4[10],zero,xmm4[11],zero,xmm4[13],zero,xmm4[12],zero,xmm4[14],zero,xmm4[15],zero +; SSE-NEXT: pmaddwd %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,xmm0[0],zero,xmm0[2],zero,xmm0[3],zero,xmm0[5],zero,xmm0[4],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pmaddwd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_swapped_indices: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_swapped_indices: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,5,6,9,10,13,14,1,2,5,6,9,10,13,14] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,3,4,7,8,11,12,15,0,3,4,7,8,11,12,15] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm2 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_swapped_indices: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,2,5,6,9,10,13,14,1,2,5,6,9,10,13,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,3,4,7,8,11,12,15,0,3,4,7,8,11,12,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> ;indices aren't all even @@ -228,15 +1054,67 @@ define <8 x i16> @pmaddubsw_swapped_extend(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_swapped_extend: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pmaddubsw (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pmaddwd %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_swapped_extend: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_swapped_extend: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4 +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_swapped_extend: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 +; AVX256-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX256-NEXT: vpmovsxbd %xmm1, %ymm1 +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -261,15 +1139,77 @@ define <8 x i16> @pmaddubsw_commuted_mul(ptr %Aptr, ptr %Bptr) { ; SSE-LABEL: pmaddubsw_commuted_mul: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pmaddubsw (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm0 +; SSE-NEXT: psraw $8, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,8],zero,xmm3[10],zero,xmm3[12],zero,xmm3[14],zero +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrlw $8, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pmaddwd %xmm3, %xmm5 +; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[u,u,u,u,u,u,u,u] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pmaddwd %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm5, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: pmaddubsw_commuted_mul: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddubsw_commuted_mul: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm7 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm4, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[9],zero,xmm1[11],zero,xmm1[13],zero,xmm1[15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovsxbw %xmm6, %xmm3 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddubsw_commuted_mul: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa (%rdi), %xmm0 +; AVX256-NEXT: vmovdqa (%rsi), %xmm1 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxbd %xmm3, %ymm3 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVX256-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %A = load <16 x i8>, ptr %Aptr %B = load <16 x i8>, ptr %Bptr %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> @@ -470,3 +1410,5 @@ %trunc = trunc <8 x i32> %min to <8 x i16> ret <8 x i16> %trunc } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -26,38 +26,46 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: and_mulhuw_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: psrlq $16, %xmm0 +; SSE2-NEXT: psrlq $16, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pmulhuw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7] +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7] +; SSE41-NEXT: pmuldq %xmm3, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2,3,128,128,128,128,128,128,10,11,128,128,128,128,128,128] +; SSE41-NEXT: pshufb %xmm2, %xmm1 +; SSE41-NEXT: pshufb %xmm2, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm0, %xmm0 -; SSE41-NEXT: pmulhuw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -66,7 +74,11 @@ ; ; AVX512-LABEL: and_mulhuw_v4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -79,14 +91,32 @@ } define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) { -; SSE-LABEL: sext_mulhw_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pmulhw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sext_mulhw_v4i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sext_mulhw_v4i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: sext_mulhw_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -226,31 +256,58 @@ } define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) { -; SSE-LABEL: sextinreg_mulhw_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $24, %xmm1 -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: pslld $24, %xmm0 -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pslld $25, %xmm3 -; SSE-NEXT: psrad $25, %xmm3 -; SSE-NEXT: pslld $25, %xmm2 -; SSE-NEXT: psrad $25, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: pmulhw %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sextinreg_mulhw_v8i16: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: psllw $9, %xmm2 +; SSE2-NEXT: psraw $9, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: packssdw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sextinreg_mulhw_v8i16: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: psllw $9, %xmm2 +; SSE41-NEXT: psraw $9, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: psraw $8, %xmm0 +; SSE41-NEXT: pmulhw %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX2-LABEL: sextinreg_mulhw_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $24, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 -; AVX2-NEXT: vpslld $25, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $25, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsllw $9, %xmm1, %xmm1 +; AVX2-NEXT: vpsraw $9, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -318,53 +375,63 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: and_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pand %xmm8, %xmm4 -; SSE2-NEXT: packssdw %xmm5, %xmm4 -; SSE2-NEXT: pmulhw %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm8 -; SSE2-NEXT: packssdw %xmm7, %xmm8 -; SSE2-NEXT: pmulhw %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767] +; SSE2-NEXT: pand %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm5 +; SSE2-NEXT: pmaddwd %xmm8, %xmm5 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pmaddwd %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm7 +; SSE2-NEXT: pmaddwd %xmm3, %xmm7 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pmaddwd %xmm2, %xmm1 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm7, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v16i16: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767] -; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm8, %xmm5 -; SSE41-NEXT: pand %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: pand %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: pand %xmm6, %xmm8 -; SSE41-NEXT: packusdw %xmm7, %xmm8 -; SSE41-NEXT: pmulhw %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm1 +; SSE41-NEXT: pmaddwd %xmm3, %xmm7 +; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pmaddwd %xmm2, %xmm6 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pmaddwd %xmm1, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm8 +; SSE41-NEXT: pmaddwd %xmm8, %xmm0 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: psrld $16, %xmm5 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq @@ -384,7 +451,8 @@ ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: retq %a1 = and <16 x i32> %a, @@ -417,13 +485,6 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; SSE2-LABEL: ashr_mulhuw_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: packssdw %xmm7, %xmm6 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pmulhw %xmm6, %xmm2 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 @@ -431,6 +492,13 @@ ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: pmulhw %xmm4, %xmm0 +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: packssdw %xmm7, %xmm6 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: pmulhw %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; @@ -687,15 +755,109 @@ } define <8 x i16> @zext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { -; SSE-LABEL: zext_mulhuw_v8i16_i64: -; SSE: # %bb.0: -; SSE-NEXT: pmulhuw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: zext_mulhuw_v8i16_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: psrlq $16, %xmm7 +; SSE2-NEXT: psrlq $16, %xmm4 +; SSE2-NEXT: psrlq $16, %xmm2 +; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: retq ; -; AVX-LABEL: zext_mulhuw_v8i16_i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: zext_mulhuw_v8i16_i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm3, %xmm2 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm6, %xmm1 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: zext_mulhuw_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: zext_mulhuw_v8i16_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = zext <8 x i16> %a to <8 x i64> %b1 = zext <8 x i16> %b to <8 x i64> %c = mul <8 x i64> %a1, %b1 @@ -705,15 +867,76 @@ } define <8 x i16> @sext_mulhuw_v8i16_i64(<8 x i16> %a, <8 x i16> %b) { -; SSE-LABEL: sext_mulhuw_v8i16_i64: -; SSE: # %bb.0: -; SSE-NEXT: pmulhw %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sext_mulhuw_v8i16_i64: +; SSE2: # %bb.0: +; SSE2-NEXT: pmulhw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: sext_mulhuw_v8i16_i64: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE41-LABEL: sext_mulhuw_v8i16_i64: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm3 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE41-NEXT: pmuldq %xmm3, %xmm2 +; SSE41-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE41-NEXT: pmuldq %xmm4, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE41-NEXT: pmuldq %xmm6, %xmm1 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: sext_mulhuw_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sext_mulhuw_v8i16_i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> %c = mul <8 x i64> %a1, %b1 @@ -751,21 +974,27 @@ define <4 x i32> @mulhsw_v4i16_lshr(<4 x i16> %a, <4 x i16> %b) { ; SSE2-LABEL: mulhsw_v4i16_lshr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v4i16_lshr: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mulhsw_v4i16_lshr: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -777,21 +1006,27 @@ define <4 x i32> @mulhsw_v4i16_ashr(<4 x i16> %a, <4 x i16> %b) { ; SSE2-LABEL: mulhsw_v4i16_ashr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v4i16_ashr: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmaddwd %xmm1, %xmm0 +; SSE41-NEXT: psrad $16, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mulhsw_v4i16_ashr: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 ; AVX-NEXT: retq %a1 = sext <4 x i16> %a to <4 x i32> %b1 = sext <4 x i16> %b to <4 x i32> @@ -868,11 +1103,10 @@ ; SSE2-LABEL: mulhsw_v8i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: pmulhw %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v8i16_ashr: @@ -900,15 +1134,15 @@ ; SSE2-LABEL: zext_mulhuw_v16i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhuw %xmm1, %xmm3 ; SSE2-NEXT: pmulhuw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: pmulhuw %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; @@ -950,15 +1184,15 @@ ; SSE2-LABEL: mulhsw_v16i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm1, %xmm3 ; SSE2-NEXT: pmulhw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: pmulhw %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; @@ -999,18 +1233,17 @@ define <16 x i32> @mulhsw_v16i16_ashr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: mulhsw_v16i16_ashr: ; SSE2: # %bb.0: -; SSE2-NEXT: pmulhw %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pmulhw %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: pmulhw %xmm2, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v16i16_ashr: @@ -1051,31 +1284,31 @@ ; SSE2-LABEL: zext_mulhuw_v32i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhuw %xmm7, %xmm3 +; SSE2-NEXT: pmulhuw %xmm6, %xmm2 +; SSE2-NEXT: pmulhuw %xmm5, %xmm1 ; SSE2-NEXT: pmulhuw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pmulhuw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) +; SSE2-NEXT: movdqa %xmm8, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm6, 64(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v32i16_lshr: @@ -1146,31 +1379,31 @@ ; SSE2-LABEL: mulhsw_v32i16_lshr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw %xmm7, %xmm3 +; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: pmulhw %xmm5, %xmm1 ; SSE2-NEXT: pmulhw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pmulhw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) +; SSE2-NEXT: movdqa %xmm8, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm6, 64(%rdi) +; SSE2-NEXT: movdqa %xmm7, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm6, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v32i16_lshr: @@ -1241,34 +1474,34 @@ ; SSE2-LABEL: mulhsw_v32i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw %xmm7, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pmulhw %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pmulhw %xmm5, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pmulhw %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pmulhw %xmm7, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm7, 96(%rdi) -; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm6, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 64(%rdi) +; SSE2-NEXT: movdqa %xmm5, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) +; SSE2-NEXT: movdqa %xmm4, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v32i16_ashr: @@ -1343,40 +1576,40 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1388,7 +1621,7 @@ ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1524,40 +1757,40 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm4, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm5, %xmm14 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm6, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; SSE2-NEXT: movdqa %xmm8, 240(%rdi) ; SSE2-NEXT: movdqa %xmm7, 224(%rdi) ; SSE2-NEXT: movdqa %xmm6, 208(%rdi) @@ -1569,7 +1802,7 @@ ; SSE2-NEXT: movdqa %xmm3, 112(%rdi) ; SSE2-NEXT: movdqa %xmm12, 96(%rdi) ; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm11, 64(%rdi) +; SSE2-NEXT: movdqa %xmm10, 64(%rdi) ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) ; SSE2-NEXT: movdqa %xmm9, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) @@ -1704,62 +1937,62 @@ ; SSE2-LABEL: mulhsw_v64i16_ashr: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm15 ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE2-NEXT: psrad $16, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm14 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; SSE2-NEXT: psrad $16, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm13 ; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; SSE2-NEXT: psrad $16, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE2-NEXT: psrad $16, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm11 ; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm10 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm9 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: movdqa %xmm7, 240(%rdi) -; SSE2-NEXT: movdqa %xmm15, 224(%rdi) -; SSE2-NEXT: movdqa %xmm6, 208(%rdi) -; SSE2-NEXT: movdqa %xmm14, 192(%rdi) -; SSE2-NEXT: movdqa %xmm5, 176(%rdi) -; SSE2-NEXT: movdqa %xmm13, 160(%rdi) -; SSE2-NEXT: movdqa %xmm4, 144(%rdi) -; SSE2-NEXT: movdqa %xmm12, 128(%rdi) -; SSE2-NEXT: movdqa %xmm3, 112(%rdi) -; SSE2-NEXT: movdqa %xmm11, 96(%rdi) -; SSE2-NEXT: movdqa %xmm2, 80(%rdi) -; SSE2-NEXT: movdqa %xmm10, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm9, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: movdqa %xmm8, 240(%rdi) +; SSE2-NEXT: movdqa %xmm7, 224(%rdi) +; SSE2-NEXT: movdqa %xmm9, 208(%rdi) +; SSE2-NEXT: movdqa %xmm6, 192(%rdi) +; SSE2-NEXT: movdqa %xmm10, 176(%rdi) +; SSE2-NEXT: movdqa %xmm5, 160(%rdi) +; SSE2-NEXT: movdqa %xmm11, 144(%rdi) +; SSE2-NEXT: movdqa %xmm4, 128(%rdi) +; SSE2-NEXT: movdqa %xmm12, 112(%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm13, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 64(%rdi) +; SSE2-NEXT: movdqa %xmm14, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) +; SSE2-NEXT: movdqa %xmm15, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: mulhsw_v64i16_ashr: @@ -1885,45 +2118,79 @@ define <8 x i64> @zext_mulhuw_v8i16_lshr_i64(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: zext_mulhuw_v8i16_lshr_i64: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pmulhuw %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm3 +; SSE2-NEXT: psrlq $16, %xmm0 +; SSE2-NEXT: psrlq $16, %xmm4 +; SSE2-NEXT: psrlq $16, %xmm2 +; SSE2-NEXT: psrlq $16, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: zext_mulhuw_v8i16_lshr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhuw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; SSE41-NEXT: pmuldq %xmm6, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: zext_mulhuw_v8i16_lshr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_mulhuw_v8i16_lshr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = zext <8 x i16> %a to <8 x i64> %b1 = zext <8 x i16> %b to <8 x i64> @@ -1951,29 +2218,51 @@ ; ; SSE41-LABEL: sext_mulhsw_v8i16_lshr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm1, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm4 +; SSE41-NEXT: pmuldq %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm3 +; SSE41-NEXT: pmuldq %xmm6, %xmm3 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: psrlq $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: sext_mulhsw_v8i16_lshr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: sext_mulhsw_v8i16_lshr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> @@ -2005,29 +2294,67 @@ ; ; SSE41-LABEL: sext_mulhsw_v8i16_ashr_i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pmulhw %xmm1, %xmm0 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm4, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm4, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: pmovsxwq %xmm3, %xmm3 +; SSE41-NEXT: pmuldq %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrad $16, %xmm4 +; SSE41-NEXT: psrlq $16, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] ; SSE41-NEXT: retq ; ; AVX2-LABEL: sext_mulhsw_v8i16_ashr_i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwq %xmm1, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 +; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrad $16, %ymm2, %ymm0 +; AVX2-NEXT: vpsrlq $16, %ymm2, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-NEXT: vpsrad $16, %ymm1, %ymm2 +; AVX2-NEXT: vpsrlq $16, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX2-NEXT: retq ; ; AVX512-LABEL: sext_mulhsw_v8i16_ashr_i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsraq $16, %zmm0, %zmm0 ; AVX512-NEXT: retq %a1 = sext <8 x i16> %a to <8 x i64> %b1 = sext <8 x i16> %b to <8 x i64> diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -7,18 +7,14 @@ ; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $3, %ecx -; CHECK-NEXT: andl $7, %ecx -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: andl $7, %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %eax, %xmm0 +; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $6, %ecx -; CHECK-NEXT: andl $7, %ecx -; CHECK-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrl $9, %eax -; CHECK-NEXT: andl $7, %eax -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %ret = load <4 x i3>, ptr %in, align 1 ret <4 x i3> %ret @@ -50,29 +46,28 @@ ; CHECK-LABEL: test3: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl (%rdi), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: negq %rcx +; CHECK-NEXT: vmovq %rcx, %xmm0 ; CHECK-NEXT: movzbl %al, %ecx -; CHECK-NEXT: shrb %al +; CHECK-NEXT: shrb $2, %al ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $1, %edx -; CHECK-NEXT: negl %edx -; CHECK-NEXT: vmovd %edx, %xmm0 -; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: shrb $2, %al -; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm1 +; CHECK-NEXT: shrb %cl ; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: negl %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: negq %rax +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %wide.load35 = load <4 x i1>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/X86/pr22338.ll b/llvm/test/CodeGen/X86/pr22338.ll --- a/llvm/test/CodeGen/X86/pr22338.ll +++ b/llvm/test/CodeGen/X86/pr22338.ll @@ -8,17 +8,20 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; X86-NEXT: sete %cl ; X86-NEXT: setne %al +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; X86-NEXT: sete %dl +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: addl %edx, %edx ; X86-NEXT: negl %eax -; X86-NEXT: addb %cl, %cl ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: addb %dl, %dl ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax ; X86-NEXT: .p2align 4, 0x90 @@ -33,17 +36,20 @@ ; ; X64-LABEL: fn: ; X64: # %bb.0: # %entry +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpl $1, %edi ; X64-NEXT: sete %cl ; X64-NEXT: setne %al +; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl $1, %esi ; X64-NEXT: sete %dl +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: addl %edx, %edx ; X64-NEXT: negl %eax -; X64-NEXT: addb %cl, %cl ; X64-NEXT: movl %eax, %esi +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %esi -; X64-NEXT: addb %dl, %dl ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: shll %cl, %eax ; X64-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/pr23258.ll b/llvm/test/CodeGen/X86/pr23258.ll --- a/llvm/test/CodeGen/X86/pr23258.ll +++ b/llvm/test/CodeGen/X86/pr23258.ll @@ -22,11 +22,11 @@ ; HAS-RAX-LABEL: bar: ; HAS-RAX: # %bb.0: ; HAS-RAX-NEXT: subq $56, %rsp -; HAS-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; HAS-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; HAS-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: testb %al, %al ; HAS-RAX-NEXT: je .LBB1_2 ; HAS-RAX-NEXT: # %bb.1: @@ -39,28 +39,28 @@ ; HAS-RAX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; HAS-RAX-NEXT: .LBB1_2: -; HAS-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; HAS-RAX-NEXT: movq %rax, 8 ; HAS-RAX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; HAS-RAX-NEXT: movq %rax, 16 -; HAS-RAX-NEXT: movl $8, 0 +; HAS-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; HAS-RAX-NEXT: movq %rax, 8 ; HAS-RAX-NEXT: movl $48, 4 +; HAS-RAX-NEXT: movl $8, 0 ; HAS-RAX-NEXT: addq $56, %rsp ; HAS-RAX-NEXT: retq ; ; NO-RAX-LABEL: bar: ; NO-RAX: # %bb.0: -; NO-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; NO-RAX-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; NO-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NO-RAX-NEXT: movq %rax, 8 +; NO-RAX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; NO-RAX-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; NO-RAX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; NO-RAX-NEXT: movq %rax, 16 -; NO-RAX-NEXT: movl $8, 0 +; NO-RAX-NEXT: leaq {{[0-9]+}}(%rsp), %rax +; NO-RAX-NEXT: movq %rax, 8 ; NO-RAX-NEXT: movl $48, 4 +; NO-RAX-NEXT: movl $8, 0 ; NO-RAX-NEXT: retq call void @llvm.va_start(ptr null) ret void diff --git a/llvm/test/CodeGen/X86/pr28472.ll b/llvm/test/CodeGen/X86/pr28472.ll --- a/llvm/test/CodeGen/X86/pr28472.ll +++ b/llvm/test/CodeGen/X86/pr28472.ll @@ -4,6 +4,11 @@ define float @same_dynamic_index_fp_vector_type(float %val, i32 %idx) { ; CHECK-LABEL: same_dynamic_index_fp_vector_type: ; CHECK: # %bb.0: # %bb +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: andl $3, %edi +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq bb: %tmp0 = insertelement <4 x float> undef, float %val, i32 %idx diff --git a/llvm/test/CodeGen/X86/pr31045.ll b/llvm/test/CodeGen/X86/pr31045.ll --- a/llvm/test/CodeGen/X86/pr31045.ll +++ b/llvm/test/CodeGen/X86/pr31045.ll @@ -19,7 +19,7 @@ ; CHECK-LABEL: _Z1av: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl struct_obj_3+8(%rip), %eax -; CHECK-NEXT: movzbl var_46(%rip), %ecx +; CHECK-NEXT: movsbl var_46(%rip), %ecx ; CHECK-NEXT: movzbl var_49(%rip), %edx ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: addl %eax, %eax diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -162,17 +162,17 @@ ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setne -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: cmpq $-1, %rax -; X64-NEXT: sete %cl -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpl $-1, %eax -; X64-NEXT: sete %dl +; X64-NEXT: sete %cl ; X64-NEXT: addq $7093, %rax # imm = 0x1BB5 -; X64-NEXT: xorl %esi, %esi -; X64-NEXT: cmpq %rax, %rdx -; X64-NEXT: setg %sil -; X64-NEXT: movq %rsi, var_57(%rip) -; X64-NEXT: movq %rcx, _ZN8struct_210member_2_0E(%rip) +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rax, %rcx +; X64-NEXT: setg %dl +; X64-NEXT: movq %rdx, var_57(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: cmpl $-1, var_5(%rip) +; X64-NEXT: sete %al +; X64-NEXT: movq %rax, _ZN8struct_210member_2_0E(%rip) ; X64-NEXT: retq ; ; X86-O0-LABEL: f1: @@ -213,35 +213,28 @@ ; ; X86-LABEL: f1: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: subl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 13 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 -; X86-NEXT: movl var_5, %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: .cfi_def_cfa_offset 9 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl var_5, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl $208307499, %edx # imm = 0xC6A852B -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl $-2, %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: setne (%esp) -; X86-NEXT: movl %eax, %esi -; X86-NEXT: andl %ecx, %esi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: addl $7093, %esi # imm = 0x1BB5 +; X86-NEXT: adcl $0, %eax ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: cmpl $-1, %ecx ; X86-NEXT: sete %dl -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: cmpl $-1, %eax -; X86-NEXT: sete %bl -; X86-NEXT: addl $7093, %eax # imm = 0x1BB5 -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: sbbl $0, %eax ; X86-NEXT: setl %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: movl %eax, var_57 @@ -249,10 +242,8 @@ ; X86-NEXT: movl %edx, _ZN8struct_210member_2_0E ; X86-NEXT: movl $0, _ZN8struct_210member_2_0E+4 ; X86-NEXT: addl $1, %esp -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: popl %ebx +; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: @@ -451,14 +442,16 @@ ; X64: # %bb.0: # %entry ; X64-NEXT: movl var_13(%rip), %eax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testl %eax, %eax -; X64-NEXT: notl %eax +; X64-NEXT: testq %rax, %rax ; X64-NEXT: sete %cl -; X64-NEXT: movl var_16(%rip), %edx -; X64-NEXT: xorl %eax, %edx -; X64-NEXT: andl %edx, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF +; X64-NEXT: xorq %rax, %rdx +; X64-NEXT: movl var_16(%rip), %esi +; X64-NEXT: xorl %edx, %esi +; X64-NEXT: andl %esi, %ecx +; X64-NEXT: orq %rdx, %rcx ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NEXT: notl %eax ; X64-NEXT: movl %eax, var_46(%rip) ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -72,9 +72,9 @@ ; ; X64-LABEL: foo: ; X64: # %bb.0: # %bb -; X64-NEXT: movzbl var_27(%rip), %ecx ; X64-NEXT: movzwl var_22(%rip), %eax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NEXT: movzbl var_27(%rip), %ecx ; X64-NEXT: addb $30, %cl ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: movb %al, (%rax) @@ -89,10 +89,10 @@ ; 686-NEXT: .cfi_def_cfa_register %ebp ; 686-NEXT: andl $-8, %esp ; 686-NEXT: subl $8, %esp -; 686-NEXT: movzbl var_27, %ecx ; 686-NEXT: movzwl var_22, %eax ; 686-NEXT: movl %eax, (%esp) ; 686-NEXT: movl $0, {{[0-9]+}}(%esp) +; 686-NEXT: movzbl var_27, %ecx ; 686-NEXT: addb $30, %cl ; 686-NEXT: xorl %edx, %edx ; 686-NEXT: shrdl %cl, %edx, %eax diff --git a/llvm/test/CodeGen/X86/pr33290.ll b/llvm/test/CodeGen/X86/pr33290.ll --- a/llvm/test/CodeGen/X86/pr33290.ll +++ b/llvm/test/CodeGen/X86/pr33290.ll @@ -14,8 +14,8 @@ ; X86-NEXT: .LBB0_1: # %for.cond ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movzbl c, %ecx -; X86-NEXT: movb $0, c ; X86-NEXT: leal a+2(%ecx), %ecx +; X86-NEXT: movb $0, c ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: jmp .LBB0_1 ; diff --git a/llvm/test/CodeGen/X86/pr33828.ll b/llvm/test/CodeGen/X86/pr33828.ll --- a/llvm/test/CodeGen/X86/pr33828.ll +++ b/llvm/test/CodeGen/X86/pr33828.ll @@ -7,18 +7,18 @@ define void @foo(i8 %a0) { ; X86-LABEL: foo: ; X86: # %bb.0: # %entry -; X86-NEXT: movsbl var_580, %eax -; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF -; X86-NEXT: jne .LBB0_1 +; X86-NEXT: movb $1, %al +; X86-NEXT: testb %al, %al +; X86-NEXT: je .LBB0_1 ; X86-NEXT: # %bb.2: # %if.end13 ; X86-NEXT: retl ; X86-NEXT: .LBB0_1: # %if.then11 ; ; X64-LABEL: foo: ; X64: # %bb.0: # %entry -; X64-NEXT: movsbl var_580(%rip), %eax -; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF -; X64-NEXT: jne .LBB0_1 +; X64-NEXT: movb $1, %al +; X64-NEXT: testb %al, %al +; X64-NEXT: je .LBB0_1 ; X64-NEXT: # %bb.2: # %if.end13 ; X64-NEXT: retq ; X64-NEXT: .LBB0_1: # %if.then11 diff --git a/llvm/test/CodeGen/X86/pr34137.ll b/llvm/test/CodeGen/X86/pr34137.ll --- a/llvm/test/CodeGen/X86/pr34137.ll +++ b/llvm/test/CodeGen/X86/pr34137.ll @@ -8,16 +8,13 @@ define void @pr34127() { ; CHECK-LABEL: pr34127: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzwl var_13(%rip), %eax -; CHECK-NEXT: movzwl var_3(%rip), %ecx -; CHECK-NEXT: andw %ax, %cx -; CHECK-NEXT: movzwl %cx, %ecx -; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl var_3(%rip), %ecx +; CHECK-NEXT: movzwl var_3(%rip), %eax +; CHECK-NEXT: movzwl var_13(%rip), %ecx ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: andl %eax, %ecx +; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: sete %dl -; CHECK-NEXT: andl %ecx, %edx +; CHECK-NEXT: andl %eax, %edx ; CHECK-NEXT: movq %rdx, var_212(%rip) ; CHECK-NEXT: movw $0, (%rax) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: vmovaps 80(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm11 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm11 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1] diff --git a/llvm/test/CodeGen/X86/pr35316.ll b/llvm/test/CodeGen/X86/pr35316.ll --- a/llvm/test/CodeGen/X86/pr35316.ll +++ b/llvm/test/CodeGen/X86/pr35316.ll @@ -25,20 +25,20 @@ ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl a(%rip), %esi ; CHECK-NEXT: movl $0, b(%rip) -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl c(%rip), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl a(%rip) +; CHECK-NEXT: idivl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: movl c(%rip), %eax +; CHECK-NEXT: andl %r8d, %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl %r8d -; CHECK-NEXT: andl %edi, %eax +; CHECK-NEXT: idivl %esi ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: andl %edi, %eax ; CHECK-NEXT: movl %eax, (%rax) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr35443.ll b/llvm/test/CodeGen/X86/pr35443.ll --- a/llvm/test/CodeGen/X86/pr35443.ll +++ b/llvm/test/CodeGen/X86/pr35443.ll @@ -9,11 +9,10 @@ ; CHECK-LABEL: pr35443: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastb ac+4(%rip), %xmm0 -; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpsubq %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpmovqd %ymm0, ai3+16(%rip) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, ai3+16(%rip) ; CHECK-NEXT: retq entry: %wide.masked.load66 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr getelementptr inbounds ([20 x i8], ptr @ac, i64 0, i64 4), i32 1, <4 x i1> , <4 x i8> undef) diff --git a/llvm/test/CodeGen/X86/pr35765.ll b/llvm/test/CodeGen/X86/pr35765.ll --- a/llvm/test/CodeGen/X86/pr35765.ll +++ b/llvm/test/CodeGen/X86/pr35765.ll @@ -15,10 +15,9 @@ ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: movzwl x(%rip), %ecx ; CHECK-NEXT: movzwl s2(%rip), %edx -; CHECK-NEXT: notl %edx -; CHECK-NEXT: orl $63488, %edx # imm = 0xF800 -; CHECK-NEXT: movzwl %dx, %edx +; CHECK-NEXT: xorl $2047, %edx # imm = 0x7FF ; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: orl $63488, %edx # imm = 0xF800 ; CHECK-NEXT: xorl %eax, %edx ; CHECK-NEXT: movslq %edx, %rax ; CHECK-NEXT: movq %rax, ll(%rip) diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll --- a/llvm/test/CodeGen/X86/pr35982.ll +++ b/llvm/test/CodeGen/X86/pr35982.ll @@ -5,8 +5,14 @@ define float @PR35982_emms(<1 x i64>) nounwind { ; NO-POSTRA-LABEL: PR35982_emms: ; NO-POSTRA: # %bb.0: -; NO-POSTRA-NEXT: subl $8, %esp -; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax +; NO-POSTRA-NEXT: pushl %ebp +; NO-POSTRA-NEXT: movl %esp, %ebp +; NO-POSTRA-NEXT: andl $-8, %esp +; NO-POSTRA-NEXT: subl $16, %esp +; NO-POSTRA-NEXT: movl 8(%ebp), %eax +; NO-POSTRA-NEXT: movl 12(%ebp), %ecx +; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NO-POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; NO-POSTRA-NEXT: movd %mm0, %ecx @@ -15,14 +21,21 @@ ; NO-POSTRA-NEXT: fildl (%esp) ; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: addl $8, %esp +; NO-POSTRA-NEXT: movl %ebp, %esp +; NO-POSTRA-NEXT: popl %ebp ; NO-POSTRA-NEXT: retl ; ; POSTRA-LABEL: PR35982_emms: ; POSTRA: # %bb.0: -; POSTRA-NEXT: subl $8, %esp +; POSTRA-NEXT: pushl %ebp +; POSTRA-NEXT: movl %esp, %ebp +; POSTRA-NEXT: andl $-8, %esp +; POSTRA-NEXT: subl $16, %esp +; POSTRA-NEXT: movl 8(%ebp), %eax +; POSTRA-NEXT: movl 12(%ebp), %ecx +; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax ; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; POSTRA-NEXT: movd %mm0, %ecx ; POSTRA-NEXT: emms @@ -30,7 +43,8 @@ ; POSTRA-NEXT: fildl (%esp) ; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; POSTRA-NEXT: addl $8, %esp +; POSTRA-NEXT: movl %ebp, %esp +; POSTRA-NEXT: popl %ebp ; POSTRA-NEXT: retl %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 @@ -49,8 +63,14 @@ define float @PR35982_femms(<1 x i64>) nounwind { ; NO-POSTRA-LABEL: PR35982_femms: ; NO-POSTRA: # %bb.0: -; NO-POSTRA-NEXT: subl $8, %esp -; NO-POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax +; NO-POSTRA-NEXT: pushl %ebp +; NO-POSTRA-NEXT: movl %esp, %ebp +; NO-POSTRA-NEXT: andl $-8, %esp +; NO-POSTRA-NEXT: subl $16, %esp +; NO-POSTRA-NEXT: movl 8(%ebp), %eax +; NO-POSTRA-NEXT: movl 12(%ebp), %ecx +; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; NO-POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 ; NO-POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; NO-POSTRA-NEXT: movd %mm0, %ecx @@ -59,14 +79,21 @@ ; NO-POSTRA-NEXT: fildl (%esp) ; NO-POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; NO-POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; NO-POSTRA-NEXT: addl $8, %esp +; NO-POSTRA-NEXT: movl %ebp, %esp +; NO-POSTRA-NEXT: popl %ebp ; NO-POSTRA-NEXT: retl ; ; POSTRA-LABEL: PR35982_femms: ; POSTRA: # %bb.0: -; POSTRA-NEXT: subl $8, %esp +; POSTRA-NEXT: pushl %ebp +; POSTRA-NEXT: movl %esp, %ebp +; POSTRA-NEXT: andl $-8, %esp +; POSTRA-NEXT: subl $16, %esp +; POSTRA-NEXT: movl 8(%ebp), %eax +; POSTRA-NEXT: movl 12(%ebp), %ecx +; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; POSTRA-NEXT: movl %eax, {{[0-9]+}}(%esp) ; POSTRA-NEXT: movq {{[0-9]+}}(%esp), %mm0 -; POSTRA-NEXT: movl {{[0-9]+}}(%esp), %eax ; POSTRA-NEXT: punpckhdq %mm0, %mm0 # mm0 = mm0[1,1] ; POSTRA-NEXT: movd %mm0, %ecx ; POSTRA-NEXT: femms @@ -74,7 +101,8 @@ ; POSTRA-NEXT: fildl (%esp) ; POSTRA-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; POSTRA-NEXT: fiaddl {{[0-9]+}}(%esp) -; POSTRA-NEXT: addl $8, %esp +; POSTRA-NEXT: movl %ebp, %esp +; POSTRA-NEXT: popl %ebp ; POSTRA-NEXT: retl %2 = bitcast <1 x i64> %0 to <2 x i32> %3 = extractelement <2 x i32> %2, i32 0 diff --git a/llvm/test/CodeGen/X86/pr38185.ll b/llvm/test/CodeGen/X86/pr38185.ll --- a/llvm/test/CodeGen/X86/pr38185.ll +++ b/llvm/test/CodeGen/X86/pr38185.ll @@ -14,11 +14,10 @@ ; CHECK-NEXT: # %bb.2: # %body ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl $1, (%rdx,%rax,4) -; CHECK-NEXT: movzbl (%rdi,%rax,4), %r8d -; CHECK-NEXT: movzbl (%rsi,%rax,4), %r9d -; CHECK-NEXT: andl %r8d, %r9d -; CHECK-NEXT: andl $1, %r9d -; CHECK-NEXT: movl %r9d, (%rdi,%rax,4) +; CHECK-NEXT: movl (%rdi,%rax,4), %r8d +; CHECK-NEXT: andl (%rsi,%rax,4), %r8d +; CHECK-NEXT: andl $1, %r8d +; CHECK-NEXT: movl %r8d, (%rdi,%rax,4) ; CHECK-NEXT: incq %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: jmp .LBB0_1 diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll --- a/llvm/test/CodeGen/X86/pr38217.ll +++ b/llvm/test/CodeGen/X86/pr38217.ll @@ -17,11 +17,11 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %r8 ; CHECK-NEXT: shrq $11, %rdx -; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710 -; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: subq %rax, %r9 -; CHECK-NEXT: imulq $42949673, %r9, %rax # imm = 0x28F5C29 -; CHECK-NEXT: shrq $32, %rax +; CHECK-NEXT: imull $10000, %edx, %eax # imm = 0x2710 +; CHECK-NEXT: movl %edi, %r9d +; CHECK-NEXT: subl %eax, %r9d +; CHECK-NEXT: imulq $1374389535, %r9, %rax # imm = 0x51EB851F +; CHECK-NEXT: shrq $37, %rax ; CHECK-NEXT: imull $100, %eax, %r10d ; CHECK-NEXT: subl %r10d, %r9d ; CHECK-NEXT: movl %ecx, %r10d diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll --- a/llvm/test/CodeGen/X86/pr38539.ll +++ b/llvm/test/CodeGen/X86/pr38539.ll @@ -6,13 +6,253 @@ define void @f() { ; X64-LABEL: f: ; X64: # %bb.0: # %BB -; X64-NEXT: movzbl (%rax), %eax +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movzbl (%rax), %ecx ; X64-NEXT: cmpb $0, (%rax) ; X64-NEXT: setne (%rax) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rax, (%rax) ; X64-NEXT: movb $0, (%rax) ; X64-NEXT: retq +; +; X86-LABEL: f: +; X86: # %bb.0: # %BB_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzbl (%eax), %eax +; X86-NEXT: movzbl (%eax), %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: divb %bl +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $30, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarl $30, %ecx +; X86-NEXT: sarl $31, %eax +; X86-NEXT: shrdl $1, %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: andl $3, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: # %BB_udiv-special-cases +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: jmp .LBB0_3 +; X86-NEXT: .LBB0_1: +; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB0_4 +; X86-NEXT: # %bb.5: # %BB_udiv-special-cases +; X86-NEXT: addl $64, %ecx +; X86-NEXT: jmp .LBB0_6 +; X86-NEXT: .LBB0_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases +; X86-NEXT: subl $62, %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: addl $-66, %ecx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: adcl $3, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movb $1, %al +; X86-NEXT: testb %al, %al +; X86-NEXT: jne .LBB0_11 +; X86-NEXT: # %bb.7: # %BB_udiv-special-cases +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: xorl $65, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: je .LBB0_11 +; X86-NEXT: # %bb.8: # %udiv-bb1 +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: andl $3, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movb $65, %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movb %cl, %ch +; X86-NEXT: andb $7, %ch +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: negb %cl +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movsbl %cl, %edx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 120(%esp,%edx), %eax +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shll %cl, %eax +; X86-NEXT: notb %cl +; X86-NEXT: movl 112(%esp,%edx), %edi +; X86-NEXT: movl 116(%esp,%edx), %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: je .LBB0_11 +; X86-NEXT: # %bb.9: # %udiv-preheader +; X86-NEXT: orl %edx, %eax +; X86-NEXT: andl $3, %eax +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movb %dl, %ch +; X86-NEXT: andb $7, %ch +; X86-NEXT: movb %dl, %cl +; X86-NEXT: shrb $3, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %esi +; X86-NEXT: movl 64(%esp,%esi), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 68(%esp,%esi), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: notb %cl +; X86-NEXT: movl 72(%esp,%esi), %esi +; X86-NEXT: addl %esi, %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movb %ch, %cl +; X86-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: addl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $3, %ecx +; X86-NEXT: andl $3, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: .LBB0_10: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $2, %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%esi,2), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $3, %eax +; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: shll $30, %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sarl $30, %edx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: shrdl $1, %ebx, %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: andl $1, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: subl %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: andl $3, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl $-1, %edx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $3, %edi +; X86-NEXT: andl $3, %edi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: jne .LBB0_10 +; X86-NEXT: .LBB0_11: # %udiv-end +; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload +; X86-NEXT: setne (%eax) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%eax) +; X86-NEXT: movb $0, (%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl BB: %A30 = alloca i66 %L17 = load i66, ptr %A30 @@ -41,10 +281,10 @@ define void @g() { ; X64-LABEL: g: ; X64: # %bb.0: # %BB -; X64-NEXT: movzbl (%rax), %eax +; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movzbl (%rax), %ecx ; X64-NEXT: cmpb $0, (%rax) ; X64-NEXT: setne (%rax) -; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rax, (%rax) ; X64-NEXT: movb $0, (%rax) ; X64-NEXT: retq @@ -58,10 +298,10 @@ ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: movzbl (%eax), %eax +; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl (%eax), %ecx ; X86-NEXT: cmpb $0, (%eax) ; X86-NEXT: setne (%eax) -; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%eax) ; X86-NEXT: movb $0, (%eax) ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -10,6 +10,7 @@ ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: # xmm2 = mem[0,0] +; CHECK-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll --- a/llvm/test/CodeGen/X86/pr38738.ll +++ b/llvm/test/CodeGen/X86/pr38738.ll @@ -130,22 +130,22 @@ ; X86SSE2-LABEL: tryset: ; X86SSE2: # %bb.0: ; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86SSE2-NEXT: movl $0, 4(%eax) -; X86SSE2-NEXT: movl $0, (%eax) -; X86SSE2-NEXT: movl $0, 12(%eax) -; X86SSE2-NEXT: movl $0, 8(%eax) -; X86SSE2-NEXT: movl $0, 20(%eax) -; X86SSE2-NEXT: movl $0, 16(%eax) -; X86SSE2-NEXT: movl $0, 28(%eax) -; X86SSE2-NEXT: movl $0, 24(%eax) -; X86SSE2-NEXT: movl $0, 36(%eax) -; X86SSE2-NEXT: movl $0, 32(%eax) -; X86SSE2-NEXT: movl $0, 44(%eax) -; X86SSE2-NEXT: movl $0, 40(%eax) -; X86SSE2-NEXT: movl $0, 52(%eax) -; X86SSE2-NEXT: movl $0, 48(%eax) ; X86SSE2-NEXT: movl $0, 60(%eax) ; X86SSE2-NEXT: movl $0, 56(%eax) +; X86SSE2-NEXT: movl $0, 52(%eax) +; X86SSE2-NEXT: movl $0, 48(%eax) +; X86SSE2-NEXT: movl $0, 44(%eax) +; X86SSE2-NEXT: movl $0, 40(%eax) +; X86SSE2-NEXT: movl $0, 36(%eax) +; X86SSE2-NEXT: movl $0, 32(%eax) +; X86SSE2-NEXT: movl $0, 28(%eax) +; X86SSE2-NEXT: movl $0, 24(%eax) +; X86SSE2-NEXT: movl $0, 20(%eax) +; X86SSE2-NEXT: movl $0, 16(%eax) +; X86SSE2-NEXT: movl $0, 12(%eax) +; X86SSE2-NEXT: movl $0, 8(%eax) +; X86SSE2-NEXT: movl $0, 4(%eax) +; X86SSE2-NEXT: movl $0, (%eax) ; X86SSE2-NEXT: retl ; ; X64AVX-LABEL: tryset: diff --git a/llvm/test/CodeGen/X86/pr39666.ll b/llvm/test/CodeGen/X86/pr39666.ll --- a/llvm/test/CodeGen/X86/pr39666.ll +++ b/llvm/test/CodeGen/X86/pr39666.ll @@ -4,7 +4,8 @@ define <2 x i64> @test5(ptr %base, <2 x i64> %src0) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vmovddup (%rdi), %xmm1 # xmm1 = mem[0,0] +; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq %res = call <2 x i64> @llvm.masked.expandload.v2i64(ptr %base, <2 x i1> , <2 x i64> %src0) ret <2 x i64>%res diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -29,7 +29,9 @@ ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %a0, <8 x i32> , <8 x i32> ret <8 x i32> %res diff --git a/llvm/test/CodeGen/X86/pr42727.ll b/llvm/test/CodeGen/X86/pr42727.ll --- a/llvm/test/CodeGen/X86/pr42727.ll +++ b/llvm/test/CodeGen/X86/pr42727.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: _ZN14simd_test_avx216c_imm_v256_alignILi1EEE6c_v256S1_S1_: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovdqu {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastd (%eax), %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; CHECK-NEXT: vpsllq $56, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, (%eax) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll --- a/llvm/test/CodeGen/X86/pr42905.ll +++ b/llvm/test/CodeGen/X86/pr42905.ll @@ -4,16 +4,10 @@ define <4 x double> @autogen_SD30452(i1 %L230) { ; CHECK-LABEL: autogen_SD30452: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [151829,151829] -; CHECK-NEXT: movq %xmm0, %rax -; CHECK-NEXT: cvtsi2sd %rax, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; CHECK-NEXT: movq %xmm2, %rax -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: cvtsi2sd %rax, %xmm2 -; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [151829,151829] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: retq BB: %I = insertelement <4 x i64> zeroinitializer, i64 151829, i32 3 diff --git a/llvm/test/CodeGen/X86/pr44976.ll b/llvm/test/CodeGen/X86/pr44976.ll --- a/llvm/test/CodeGen/X86/pr44976.ll +++ b/llvm/test/CodeGen/X86/pr44976.ll @@ -65,7 +65,7 @@ ; CHECK-NEXT: por %xmm5, %xmm3 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] ; CHECK-NEXT: movdqa %xmm3, %xmm4 -; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,1] ; CHECK-NEXT: movdqa %xmm0, %xmm5 ; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[2,0] ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/pr45563-2.ll b/llvm/test/CodeGen/X86/pr45563-2.ll --- a/llvm/test/CodeGen/X86/pr45563-2.ll +++ b/llvm/test/CodeGen/X86/pr45563-2.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: movzbl %sil, %esi ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -28,22 +29,23 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u],zero,xmm3[u,u,u] +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm4 +; CHECK-NEXT: vblendvps %ymm1, %ymm4, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm1, %ymm2 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm4, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm2, %xmm3, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -62,6 +64,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: movzbl %sil, %esi ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -70,37 +73,37 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] +; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 +; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm5, %ymm5 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vblendvps %xmm1, %xmm3, %xmm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm6, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vmovss %xmm0, 48(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -119,6 +122,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: movzbl %sil, %esi ; CHECK-NEXT: vmovd %esi, %xmm1 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 @@ -127,39 +131,39 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm3 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0] -; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3] ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm2 -; CHECK-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm3[u],zero,xmm3[u] +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm5, %xmm5 +; CHECK-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; CHECK-NEXT: vmaskmovps (%rcx), %ymm1, %ymm5 +; CHECK-NEXT: vblendvps %ymm1, %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm3, %ymm3 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 +; CHECK-NEXT: vmaskmovps 32(%rcx), %ymm5, %ymm5 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm0 -; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vextractf128 $1, %ymm5, %xmm0 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vmovlps %xmm0, 48(%rdi) -; CHECK-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm0 +; CHECK-NEXT: vblendvps %xmm2, %xmm5, %xmm3, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr45833.ll b/llvm/test/CodeGen/X86/pr45833.ll --- a/llvm/test/CodeGen/X86/pr45833.ll +++ b/llvm/test/CodeGen/X86/pr45833.ll @@ -20,7 +20,8 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] ; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 @@ -28,8 +29,9 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u],zero,xmm4[u,u,u] +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm4, 32(%rdi) ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero @@ -61,7 +63,8 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 @@ -69,24 +72,24 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm5 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -111,7 +114,8 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovd %esi, %xmm2 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: vmovd %eax, %xmm2 ; CHECK-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $3, %r8d, %xmm2, %xmm2 @@ -119,20 +123,20 @@ ; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm4 -; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm4, %xmm4 -; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm4, %xmm4 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3 +; CHECK-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpslld $31, %xmm4, %xmm4 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[8,u,9,u,10,u,11,u,12,u,13,u],zero,xmm4[u],zero,xmm4[u] +; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] @@ -213,15 +217,15 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] @@ -231,6 +235,7 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0] ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 @@ -245,35 +250,33 @@ ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 ; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm3, 32(%rdi) -; CHECK-NEXT: vmovd %esi, %xmm2 -; CHECK-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $6, %r8d, %xmm2, %xmm2 +; CHECK-NEXT: vmovd %eax, %xmm2 +; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; CHECK-NEXT: vpslld $31, %xmm3, %xmm3 -; CHECK-NEXT: vpinsrb $8, %r9d, %xmm2, %xmm2 +; CHECK-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, (%rdi) -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; CHECK-NEXT: vmaskmovps %ymm1, %ymm2, 64(%rdi) +; CHECK-NEXT: vmovd %esi, %xmm1 +; CHECK-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 -; CHECK-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; CHECK-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 ; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, 64(%rdi) +; CHECK-NEXT: vmaskmovps %ymm0, %ymm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq call void @llvm.masked.store.v23f32.p0(<23 x float> %value, ptr %addr, i32 4, <23 x i1>%mask) diff --git a/llvm/test/CodeGen/X86/pr46820.ll b/llvm/test/CodeGen/X86/pr46820.ll --- a/llvm/test/CodeGen/X86/pr46820.ll +++ b/llvm/test/CodeGen/X86/pr46820.ll @@ -11,13 +11,15 @@ ; CHECK-LABEL: load23: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups (%rsi), %zmm0 -; CHECK-NEXT: vmovaps 64(%rsi), %xmm1 -; CHECK-NEXT: vmovdqa 80(%rsi), %xmm2 -; CHECK-NEXT: vextractps $2, %xmm2, 88(%rdi) -; CHECK-NEXT: vmovq %xmm2, 80(%rdi) -; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm0, (%rdi) +; CHECK-NEXT: vmovups 64(%rsi), %ymm0 +; CHECK-NEXT: vmovups (%rsi), %zmm1 +; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss %xmm3, 88(%rdi) +; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm1, (%rdi) +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %t0 = load <23 x float>, ptr %p, align 16 diff --git a/llvm/test/CodeGen/X86/pr46877.ll b/llvm/test/CodeGen/X86/pr46877.ll --- a/llvm/test/CodeGen/X86/pr46877.ll +++ b/llvm/test/CodeGen/X86/pr46877.ll @@ -11,7 +11,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero ; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm12 ; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm10 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm10 = (xmm3 * xmm10) - xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm3 * xmm10) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm5 = -(xmm12 * xmm5) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm2 ; CHECK-NEXT: vmulss %xmm2, %xmm10, %xmm4 @@ -186,7 +186,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm11 = -(xmm12 * xmm11) + xmm0 ; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm10 = -(xmm12 * xmm10) + xmm0 -; CHECK-NEXT: vfmsub213ss {{.*#+}} xmm8 = (xmm15 * xmm8) - xmm0 +; CHECK-NEXT: vfnmadd213ss {{.*#+}} xmm8 = -(xmm15 * xmm8) + xmm0 ; CHECK-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * mem) + xmm0 ; CHECK-NEXT: vmulss %xmm4, %xmm3, %xmm0 ; CHECK-NEXT: vmulss %xmm0, %xmm5, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr47517.ll b/llvm/test/CodeGen/X86/pr47517.ll --- a/llvm/test/CodeGen/X86/pr47517.ll +++ b/llvm/test/CodeGen/X86/pr47517.ll @@ -6,7 +6,19 @@ ; CHECK-LABEL: test: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: mulss %xmm0, %xmm2 +; CHECK-NEXT: addss %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: addss %xmm2, %xmm1 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: mulss %xmm1, %xmm2 +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm1 +; CHECK-NEXT: addss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: retq entry: %a1 = getelementptr inbounds float, ptr %p, i32 1 diff --git a/llvm/test/CodeGen/X86/pr49162.ll b/llvm/test/CodeGen/X86/pr49162.ll --- a/llvm/test/CodeGen/X86/pr49162.ll +++ b/llvm/test/CodeGen/X86/pr49162.ll @@ -6,8 +6,7 @@ ; X86-LABEL: PR49162: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 8(%eax), %ecx -; X86-NEXT: shll $16, %ecx +; X86-NEXT: movl 6(%eax), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: shldl $16, %ecx, %eax @@ -17,10 +16,7 @@ ; ; X64-LABEL: PR49162: ; X64: # %bb.0: -; X64-NEXT: movl 8(%rsi), %eax -; X64-NEXT: shll $16, %eax -; X64-NEXT: cltq -; X64-NEXT: sarq $16, %rax +; X64-NEXT: movswq 8(%rsi), %rax ; X64-NEXT: leaq (%rdi,%rax,4), %rax ; X64-NEXT: retq %load160 = load i160, ptr %ptr160, align 4 diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll --- a/llvm/test/CodeGen/X86/pr49451.ll +++ b/llvm/test/CodeGen/X86/pr49451.ll @@ -10,7 +10,7 @@ ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: xorl %ebx, %ebx @@ -51,7 +51,6 @@ ; X64-NEXT: leal -23090(%rax), %edi ; X64-NEXT: movw %di, s_0(%rip) ; X64-NEXT: incq %rax -; X64-NEXT: leal -23091(%rax), %edi ; X64-NEXT: cmpw $73, %di ; X64-NEXT: jl .LBB0_1 ; X64-NEXT: # %bb.4: # %for.body1703 diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll --- a/llvm/test/CodeGen/X86/pr50609.ll +++ b/llvm/test/CodeGen/X86/pr50609.ll @@ -4,10 +4,11 @@ define void @PR50609(ptr noalias nocapture %RET, ptr noalias %aFOO, <16 x i32> %__mask) nounwind { ; CHECK-LABEL: PR50609: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: leal 40(%rsi), %eax ; CHECK-NEXT: vmovq %rsi, %xmm2 -; CHECK-NEXT: vmovd %eax, %xmm3 +; CHECK-NEXT: addq $40, %rsi +; CHECK-NEXT: vmovq %rsi, %xmm3 ; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: vpsrad $31, %xmm2, %xmm3 ; CHECK-NEXT: vpsrld $30, %xmm3, %xmm3 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/pr51615.ll b/llvm/test/CodeGen/X86/pr51615.ll --- a/llvm/test/CodeGen/X86/pr51615.ll +++ b/llvm/test/CodeGen/X86/pr51615.ll @@ -11,14 +11,15 @@ ; AVX-LABEL: volatile_load_2_elts: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps g0(%rip), %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX-NEXT: vmovaps %ymm0, (%rax) -; AVX-NEXT: vmovaps %ymm1, (%rax) +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX-NEXT: vmovapd %ymm0, (%rax) +; AVX-NEXT: vmovaps %ymm2, (%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll --- a/llvm/test/CodeGen/X86/pr53419.ll +++ b/llvm/test/CodeGen/X86/pr53419.ll @@ -13,19 +13,14 @@ ; All four versions are semantically equivalent and should produce same asm as scalar version. define i1 @intrinsic_v2i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v2i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: cmpw (%rdi), %ax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v2i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -37,19 +32,14 @@ } define i1 @intrinsic_v4i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v4i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movl (%rsi), %eax -; X64-NEXT: cmpl (%rdi), %eax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v4i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -61,22 +51,16 @@ } define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: intrinsic_v8i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: cmpq (%rdi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: intrinsic_v8i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpmovmskb %xmm0, %eax +; X86-NEXT: cmpb $-1, %al ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -88,19 +72,14 @@ } define i1 @vector_version_v2i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v2i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movzwl (%rsi), %eax -; X64-NEXT: cmpw (%rdi), %ax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v2i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: cmpw (%eax), %cx +; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -113,19 +92,14 @@ } define i1 @vector_version_v4i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v4i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movl (%rsi), %eax -; X64-NEXT: cmpl (%rdi), %eax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v4i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: cmpl (%eax), %ecx +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vptest %xmm0, %xmm0 ; X86-NEXT: sete %al ; X86-NEXT: retl bb: @@ -138,22 +112,15 @@ } define i1 @vector_version_v8i8(ptr align 1 %arg, ptr align 1 %arg1) { -; X64-LABEL: vector_version_v8i8: -; X64: # %bb.0: # %bb -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: cmpq (%rdi), %rax -; X64-NEXT: sete %al -; X64-NEXT: retq -; ; X86-LABEL: vector_version_v8i8: ; X86: # %bb.0: # %bb ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; X86-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovmskb %xmm0, %eax +; X86-NEXT: xorl $65535, %eax # imm = 0xFFFF ; X86-NEXT: sete %al ; X86-NEXT: retl bb: diff --git a/llvm/test/CodeGen/X86/pr53842.ll b/llvm/test/CodeGen/X86/pr53842.ll --- a/llvm/test/CodeGen/X86/pr53842.ll +++ b/llvm/test/CodeGen/X86/pr53842.ll @@ -5,20 +5,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s define void @PR53842() { -; CHECK-LABEL: PR53842: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB0_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %ymm3 -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %ymm2 -; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; CHECK-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; CHECK-NEXT: jmp .LBB0_1 entry: br label %vector.body @@ -36,3 +22,5 @@ unreachable } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/pr56103.ll b/llvm/test/CodeGen/X86/pr56103.ll --- a/llvm/test/CodeGen/X86/pr56103.ll +++ b/llvm/test/CodeGen/X86/pr56103.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax ; CHECK-NEXT: movq $1, (%rax) ; CHECK-NEXT: movq a@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl (%rax), %ecx +; CHECK-NEXT: movslq (%rax), %rcx ; CHECK-NEXT: movl $-2, %eax ; CHECK-NEXT: sarl %cl, %eax ; CHECK-NEXT: movq c@GOTPCREL(%rip), %rdx diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -4,12 +4,11 @@ define void @main.41() local_unnamed_addr #1 { ; CHECK-LABEL: main.41: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpbroadcastw (%rax), %xmm0 -; CHECK-NEXT: vmovdqu (%rax), %ymm2 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 +; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-NEXT: vmovdqu (%rax), %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [16,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 ; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr57402.ll b/llvm/test/CodeGen/X86/pr57402.ll --- a/llvm/test/CodeGen/X86/pr57402.ll +++ b/llvm/test/CodeGen/X86/pr57402.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: PR57402: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl $-2, %eax +; CHECK-NEXT: andl $65534, %eax # imm = 0xFFFE ; CHECK-NEXT: leal 1(%rax,%rax,2), %ecx ; CHECK-NEXT: movswq %cx, %rsi ; CHECK-NEXT: xorl %edi, %edi diff --git a/llvm/test/CodeGen/X86/pr57658.ll b/llvm/test/CodeGen/X86/pr57658.ll --- a/llvm/test/CodeGen/X86/pr57658.ll +++ b/llvm/test/CodeGen/X86/pr57658.ll @@ -6,9 +6,8 @@ ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] ; CHECK-NEXT: xorpd %xmm0, %xmm1 +; CHECK-NEXT: mulpd %xmm1, %xmm0 ; CHECK-NEXT: mulpd %xmm0, %xmm1 -; CHECK-NEXT: mulpd %xmm0, %xmm1 -; CHECK-NEXT: mulpd %xmm0, %xmm0 ; CHECK-NEXT: mulpd %xmm1, %xmm0 ; CHECK-NEXT: retq BB: diff --git a/llvm/test/CodeGen/X86/pr61923.ll b/llvm/test/CodeGen/X86/pr61923.ll --- a/llvm/test/CodeGen/X86/pr61923.ll +++ b/llvm/test/CodeGen/X86/pr61923.ll @@ -17,9 +17,12 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %memcmp.loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups (%rsi,%rcx), %ymm0 -; CHECK-NEXT: vxorps (%rdi,%rcx), %ymm0, %ymm0 -; CHECK-NEXT: vptest %ymm0, %ymm0 +; CHECK-NEXT: vmovdqu (%rsi,%rcx), %xmm0 +; CHECK-NEXT: vmovdqu 16(%rsi,%rcx), %xmm1 +; CHECK-NEXT: vpxor (%rdi,%rcx), %xmm0, %xmm0 +; CHECK-NEXT: vpxor 16(%rdi,%rcx), %xmm1, %xmm1 +; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vptest %xmm0, %xmm0 ; CHECK-NEXT: jne .LBB0_4 ; CHECK-NEXT: # %bb.3: # %memcmp.loop.latch ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 @@ -27,7 +30,6 @@ ; CHECK-NEXT: cmpq %rax, %rcx ; CHECK-NEXT: jb .LBB0_2 ; CHECK-NEXT: .LBB0_4: # %done -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %len.wide = zext i32 %len to i64 diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -28,32 +28,32 @@ ; AVX1-LABEL: PR62286: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -68,11 +68,12 @@ ; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: vpaddd %zmm0, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vpmovsxdq %ymm1, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1 +; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -30,9 +30,7 @@ ; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 +; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -159,9 +157,7 @@ ; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 -; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 -; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0 -; AVX256VL-NEXT: kshiftrw $8, %k0, %k2 +; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -54,7 +54,6 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE4-NEXT: pxor %xmm5, %xmm6 -; SSE4-NEXT: psllq $63, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE4-NEXT: pmovsxdq %xmm6, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 @@ -68,15 +67,24 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR45808: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpslld $31, %xmm2, %xmm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %3 = icmp sgt <4 x i64> %0, %1 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -744,87 +744,125 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: test13: ; SSE2: # %bb.0: # %vector.ph -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm6, %xmm5 -; SSE2-NEXT: psubusw %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test13: ; SSSE3: # %bb.0: # %vector.ph -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm6, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm6, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubw %xmm1, %xmm0 +; SSSE3-NEXT: pandn %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test13: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm3, %xmm6 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 +; SSE41-NEXT: pxor %xmm7, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pmaxud %xmm1, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm7, %xmm3 +; SSE41-NEXT: packssdw %xmm6, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test13: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test13: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test13: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpcmpnltud %ymm1, %ymm2, %k1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq vector.ph: @@ -970,14 +1008,13 @@ ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 @@ -1010,87 +1047,122 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE2-LABEL: test15: ; SSE2: # %bb.0: # %vector.ph +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm6, %xmm5 -; SSE2-NEXT: psubusw %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm3, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE2-NEXT: packssdw %xmm5, %xmm4 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test15: ; SSSE3: # %bb.0: # %vector.ph +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm2, %xmm5 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSSE3-NEXT: psubusw %xmm5, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pxor %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4 +; SSSE3-NEXT: packssdw %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm3, %xmm2 +; SSSE3-NEXT: pshufb %xmm3, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: psubw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test15: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pminud %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm4, %xmm7 +; SSE41-NEXT: packssdw %xmm6, %xmm7 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test15: ; AVX1: # %bb.0: # %vector.ph -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test15: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test15: ; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpmovusdw %ymm1, %xmm1 -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpcmpnleud %ymm1, %ymm2, %k1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq vector.ph: @@ -1161,19 +1233,21 @@ ; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pmaxud %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: pmaxud %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 -; SSE41-NEXT: packssdw %xmm4, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pminud %xmm2, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pminud %xmm1, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm4, %xmm7 +; SSE41-NEXT: packssdw %xmm6, %xmm7 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] ; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: psubw %xmm1, %xmm0 -; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test16: @@ -1182,12 +1256,12 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpmaxud %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -1201,8 +1275,8 @@ ; AVX2-LABEL: test16: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -1217,7 +1291,7 @@ ; AVX512-LABEL: test16: ; AVX512: # %bb.0: # %vector.ph ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpcmpltud %ymm2, %ymm1, %k1 +; AVX512-NEXT: vpcmpnleud %ymm1, %ymm2, %k1 ; AVX512-NEXT: vpmovdw %ymm1, %xmm1 ; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper @@ -1539,17 +1613,17 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 @@ -1631,27 +1705,27 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] +; SSE2OR3-NEXT: movdqa %xmm6, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm8, %xmm9 ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535] -; SSE2OR3-NEXT: pand %xmm8, %xmm2 -; SSE2OR3-NEXT: pandn %xmm7, %xmm8 -; SSE2OR3-NEXT: por %xmm2, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] +; SSE2OR3-NEXT: pand %xmm9, %xmm2 +; SSE2OR3-NEXT: pandn %xmm7, %xmm9 +; SSE2OR3-NEXT: por %xmm2, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 ; SSE2OR3-NEXT: pxor %xmm5, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm8 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2OR3-NEXT: pand %xmm10, %xmm8 +; SSE2OR3-NEXT: pand %xmm9, %xmm8 ; SSE2OR3-NEXT: pand %xmm8, %xmm1 ; SSE2OR3-NEXT: pandn %xmm7, %xmm8 ; SSE2OR3-NEXT: por %xmm1, %xmm8 @@ -1660,28 +1734,28 @@ ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 ; SSE2OR3-NEXT: pxor %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 +; SSE2OR3-NEXT: pand %xmm8, %xmm2 ; SSE2OR3-NEXT: pand %xmm2, %xmm4 ; SSE2OR3-NEXT: pandn %xmm7, %xmm2 ; SSE2OR3-NEXT: por %xmm4, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 -; SSE2OR3-NEXT: pxor %xmm5, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2OR3-NEXT: pand %xmm6, %xmm4 -; SSE2OR3-NEXT: pand %xmm4, %xmm3 -; SSE2OR3-NEXT: pandn %xmm7, %xmm4 -; SSE2OR3-NEXT: por %xmm3, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2OR3-NEXT: pxor %xmm3, %xmm5 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm4 +; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm4, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm3 +; SSE2OR3-NEXT: pandn %xmm7, %xmm5 +; SSE2OR3-NEXT: por %xmm3, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] ; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] @@ -1690,53 +1764,52 @@ ; ; SSE41-LABEL: psubus_8i64_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm7, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [65535,65535] -; SSE41-NEXT: movapd %xmm8, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm10 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm8, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [65535,65535] +; SSE41-NEXT: movapd %xmm6, %xmm9 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm4 +; SSE41-NEXT: movapd %xmm6, %xmm4 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm10, %xmm4 +; SSE41-NEXT: packusdw %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm3 +; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm8, %xmm3 +; SSE41-NEXT: movapd %xmm6, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pand %xmm7, %xmm5 +; SSE41-NEXT: pxor %xmm1, %xmm7 +; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: packusdw %xmm3, %xmm6 +; SSE41-NEXT: packusdw %xmm4, %xmm6 +; SSE41-NEXT: psubusw %xmm6, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 -; SSE41-NEXT: packusdw %xmm3, %xmm8 -; SSE41-NEXT: packusdw %xmm4, %xmm8 -; SSE41-NEXT: psubusw %xmm8, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_8i64_max: @@ -1805,60 +1878,69 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2OR3-LABEL: psubus_16i32_max: ; SSE2OR3: # %bb.0: # %vector.ph -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2OR3-NEXT: movdqa %xmm5, %xmm8 -; SSE2OR3-NEXT: pxor %xmm7, %xmm8 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm5, %xmm9 -; SSE2OR3-NEXT: pslld $16, %xmm9 -; SSE2OR3-NEXT: psrad $16, %xmm9 -; SSE2OR3-NEXT: movdqa %xmm4, %xmm10 -; SSE2OR3-NEXT: pxor %xmm7, %xmm10 -; SSE2OR3-NEXT: movdqa %xmm6, %xmm5 -; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE2OR3-NEXT: pand %xmm5, %xmm4 -; SSE2OR3-NEXT: pxor %xmm8, %xmm5 -; SSE2OR3-NEXT: por %xmm4, %xmm5 -; SSE2OR3-NEXT: pslld $16, %xmm5 -; SSE2OR3-NEXT: psrad $16, %xmm5 -; SSE2OR3-NEXT: packssdw %xmm9, %xmm5 -; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 -; SSE2OR3-NEXT: pxor %xmm7, %xmm4 -; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 +; SSE2OR3-NEXT: pxor %xmm8, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm7 +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm6 +; SSE2OR3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2OR3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm9 +; SSE2OR3-NEXT: psubd %xmm4, %xmm1 +; SSE2OR3-NEXT: pxor %xmm8, %xmm4 +; SSE2OR3-NEXT: por %xmm8, %xmm9 ; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm3, %xmm9 -; SSE2OR3-NEXT: pslld $16, %xmm9 -; SSE2OR3-NEXT: psrad $16, %xmm9 -; SSE2OR3-NEXT: pxor %xmm2, %xmm7 -; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pand %xmm2, %xmm6 -; SSE2OR3-NEXT: por %xmm8, %xmm6 +; SSE2OR3-NEXT: pand %xmm9, %xmm1 +; SSE2OR3-NEXT: movdqa %xmm6, %xmm4 +; SSE2OR3-NEXT: psubd %xmm5, %xmm6 +; SSE2OR3-NEXT: pxor %xmm8, %xmm5 +; SSE2OR3-NEXT: por %xmm8, %xmm4 +; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2OR3-NEXT: pand %xmm4, %xmm6 +; SSE2OR3-NEXT: movdqa %xmm0, %xmm4 +; SSE2OR3-NEXT: psubd %xmm2, %xmm0 +; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: por %xmm8, %xmm4 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2OR3-NEXT: pand %xmm4, %xmm0 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm2 +; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: por %xmm7, %xmm8 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2OR3-NEXT: psubd %xmm3, %xmm7 +; SSE2OR3-NEXT: pand %xmm8, %xmm7 +; SSE2OR3-NEXT: pslld $16, %xmm7 +; SSE2OR3-NEXT: psrad $16, %xmm7 +; SSE2OR3-NEXT: pslld $16, %xmm0 +; SSE2OR3-NEXT: psrad $16, %xmm0 +; SSE2OR3-NEXT: packssdw %xmm7, %xmm0 ; SSE2OR3-NEXT: pslld $16, %xmm6 ; SSE2OR3-NEXT: psrad $16, %xmm6 -; SSE2OR3-NEXT: packssdw %xmm9, %xmm6 -; SSE2OR3-NEXT: psubusw %xmm6, %xmm0 -; SSE2OR3-NEXT: psubusw %xmm5, %xmm1 +; SSE2OR3-NEXT: pslld $16, %xmm1 +; SSE2OR3-NEXT: psrad $16, %xmm1 +; SSE2OR3-NEXT: packssdw %xmm6, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: ; SSE41: # %bb.0: # %vector.ph -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm6, %xmm5 -; SSE41-NEXT: pminud %xmm6, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: pminud %xmm6, %xmm3 -; SSE41-NEXT: pminud %xmm6, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: psubusw %xmm2, %xmm0 -; SSE41-NEXT: psubusw %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm8, %xmm8 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE41-NEXT: pmaxud %xmm5, %xmm1 +; SSE41-NEXT: psubd %xmm5, %xmm1 +; SSE41-NEXT: pmaxud %xmm4, %xmm7 +; SSE41-NEXT: psubd %xmm4, %xmm7 +; SSE41-NEXT: packusdw %xmm1, %xmm7 +; SSE41-NEXT: pmaxud %xmm3, %xmm0 +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: pmaxud %xmm2, %xmm6 +; SSE41-NEXT: psubd %xmm2, %xmm6 +; SSE41-NEXT: packusdw %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: psubus_16i32_max: @@ -1911,17 +1993,17 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 @@ -2006,17 +2088,17 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 @@ -2589,27 +2671,33 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) { ; SSE2-LABEL: test32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm6, %xmm5 -; SSE2-NEXT: psubusw %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: packssdw %xmm1, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test32: @@ -2638,31 +2726,36 @@ ; ; SSE41-LABEL: test32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm1 -; SSE41-NEXT: packusdw %xmm2, %xmm1 -; SSE41-NEXT: psubusw %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pminud %xmm2, %xmm4 +; SSE41-NEXT: pminud %xmm1, %xmm3 +; SSE41-NEXT: packusdw %xmm4, %xmm3 +; SSE41-NEXT: psubw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2687,29 +2780,29 @@ ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 ; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2OR3-NEXT: pand %xmm10, %xmm9 -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm3, %xmm9 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm9, %xmm10 +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] +; SSE2OR3-NEXT: pand %xmm10, %xmm3 +; SSE2OR3-NEXT: pandn %xmm8, %xmm10 +; SSE2OR3-NEXT: por %xmm3, %xmm10 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 -; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm11, %xmm3 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 ; SSE2OR3-NEXT: pand %xmm3, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 +; SSE2OR3-NEXT: pandn %xmm8, %xmm3 ; SSE2OR3-NEXT: por %xmm2, %xmm3 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] +; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] ; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 @@ -2718,25 +2811,26 @@ ; SSE2OR3-NEXT: pand %xmm2, %xmm0 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 ; SSE2OR3-NEXT: pxor %xmm6, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm3 +; SSE2OR3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm2 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 +; SSE2OR3-NEXT: pand %xmm3, %xmm2 ; SSE2OR3-NEXT: pand %xmm2, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 +; SSE2OR3-NEXT: pandn %xmm8, %xmm2 ; SSE2OR3-NEXT: por %xmm5, %xmm2 ; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 ; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm7, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm3 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 -; SSE2OR3-NEXT: pxor %xmm3, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm3 -; SSE2OR3-NEXT: por %xmm8, %xmm3 +; SSE2OR3-NEXT: pand %xmm5, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm4 +; SSE2OR3-NEXT: pandn %xmm8, %xmm3 +; SSE2OR3-NEXT: por %xmm4, %xmm3 ; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] ; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 ; SSE2OR3-NEXT: psubd %xmm3, %xmm2 @@ -2748,55 +2842,54 @@ ; ; SSE41-LABEL: test33: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm9, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm9, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE41-NEXT: movapd %xmm7, %xmm10 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] ; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 +; SSE41-NEXT: movapd %xmm7, %xmm3 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm7 -; SSE41-NEXT: psubd %xmm3, %xmm7 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[0,2] +; SSE41-NEXT: pmaxud %xmm3, %xmm6 +; SSE41-NEXT: psubd %xmm3, %xmm6 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 +; SSE41-NEXT: pxor %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pand %xmm8, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm8 +; SSE41-NEXT: movdqa %xmm9, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] +; SSE41-NEXT: pmaxud %xmm7, %xmm1 +; SSE41-NEXT: psubd %xmm7, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test33: @@ -2905,219 +2998,203 @@ ; SSE2OR3-LABEL: test34: ; SSE2OR3: # %bb.0: ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] -; SSE2OR3-NEXT: pand %xmm6, %xmm1 ; SSE2OR3-NEXT: pand %xmm6, %xmm0 -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm8 -; SSE2OR3-NEXT: pxor %xmm6, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm10 -; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2OR3-NEXT: pand %xmm10, %xmm9 -; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2OR3-NEXT: pand %xmm9, %xmm3 -; SSE2OR3-NEXT: pxor %xmm8, %xmm9 -; SSE2OR3-NEXT: por %xmm3, %xmm9 +; SSE2OR3-NEXT: pand %xmm6, %xmm1 +; SSE2OR3-NEXT: pxor %xmm7, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm1, %xmm6 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm9 +; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE2OR3-NEXT: movdqa %xmm0, %xmm10 +; SSE2OR3-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] +; SSE2OR3-NEXT: movdqa %xmm3, %xmm11 +; SSE2OR3-NEXT: pxor %xmm7, %xmm11 +; SSE2OR3-NEXT: movdqa %xmm10, %xmm12 +; SSE2OR3-NEXT: por %xmm7, %xmm12 +; SSE2OR3-NEXT: movdqa %xmm11, %xmm13 +; SSE2OR3-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm11, %xmm12 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm14, %xmm11 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,1,3,3] +; SSE2OR3-NEXT: por %xmm11, %xmm12 +; SSE2OR3-NEXT: pand %xmm12, %xmm10 +; SSE2OR3-NEXT: pandn %xmm3, %xmm12 +; SSE2OR3-NEXT: por %xmm10, %xmm12 ; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 -; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm11 +; SSE2OR3-NEXT: pxor %xmm7, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm9, %xmm10 +; SSE2OR3-NEXT: por %xmm7, %xmm10 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm11 ; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm11, %xmm3 -; SSE2OR3-NEXT: pand %xmm3, %xmm2 -; SSE2OR3-NEXT: pxor %xmm8, %xmm3 -; SSE2OR3-NEXT: por %xmm2, %xmm3 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm9[0,2] -; SSE2OR3-NEXT: movdqa %xmm0, %xmm2 -; SSE2OR3-NEXT: psubd %xmm3, %xmm2 -; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: por %xmm6, %xmm0 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2OR3-NEXT: pand %xmm2, %xmm0 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm3, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm13, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2OR3-NEXT: por %xmm10, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm9 +; SSE2OR3-NEXT: pandn %xmm2, %xmm3 +; SSE2OR3-NEXT: por %xmm9, %xmm3 +; SSE2OR3-NEXT: packuswb %xmm12, %xmm3 ; SSE2OR3-NEXT: movdqa %xmm5, %xmm2 -; SSE2OR3-NEXT: pxor %xmm6, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2OR3-NEXT: movdqa %xmm7, %xmm9 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm9 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 -; SSE2OR3-NEXT: pand %xmm2, %xmm5 -; SSE2OR3-NEXT: pxor %xmm8, %xmm2 -; SSE2OR3-NEXT: por %xmm5, %xmm2 -; SSE2OR3-NEXT: movdqa %xmm4, %xmm3 -; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2OR3-NEXT: pand %xmm7, %xmm3 -; SSE2OR3-NEXT: pxor %xmm3, %xmm8 -; SSE2OR3-NEXT: pand %xmm4, %xmm3 -; SSE2OR3-NEXT: por %xmm8, %xmm3 -; SSE2OR3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 -; SSE2OR3-NEXT: psubd %xmm3, %xmm2 -; SSE2OR3-NEXT: pxor %xmm6, %xmm3 -; SSE2OR3-NEXT: por %xmm6, %xmm1 -; SSE2OR3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2OR3-NEXT: pand %xmm2, %xmm1 +; SSE2OR3-NEXT: pxor %xmm7, %xmm2 +; SSE2OR3-NEXT: movdqa %xmm8, %xmm9 +; SSE2OR3-NEXT: por %xmm7, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm10 +; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm11, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm9 +; SSE2OR3-NEXT: pand %xmm9, %xmm8 +; SSE2OR3-NEXT: pandn %xmm5, %xmm9 +; SSE2OR3-NEXT: por %xmm8, %xmm9 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 +; SSE2OR3-NEXT: pxor %xmm7, %xmm2 +; SSE2OR3-NEXT: por %xmm6, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm5 +; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm5 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] +; SSE2OR3-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; SSE2OR3-NEXT: pand %xmm8, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2OR3-NEXT: por %xmm2, %xmm5 +; SSE2OR3-NEXT: pand %xmm5, %xmm6 +; SSE2OR3-NEXT: pandn %xmm4, %xmm5 +; SSE2OR3-NEXT: por %xmm6, %xmm5 +; SSE2OR3-NEXT: packuswb %xmm9, %xmm5 +; SSE2OR3-NEXT: psubd %xmm3, %xmm0 +; SSE2OR3-NEXT: psubd %xmm5, %xmm1 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: test34: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm6 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; SSE41-NEXT: pand %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm0, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; SSE41-NEXT: movdqa %xmm6, %xmm11 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm3, %xmm12 +; SSE41-NEXT: pxor %xmm8, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm12, %xmm13 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] +; SSE41-NEXT: pand %xmm13, %xmm0 +; SSE41-NEXT: por %xmm12, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm11 +; SSE41-NEXT: pxor %xmm8, %xmm11 +; SSE41-NEXT: movdqa %xmm10, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm11, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm0 +; SSE41-NEXT: por %xmm11, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: movdqa %xmm9, %xmm0 +; SSE41-NEXT: por %xmm8, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm10 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm10, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm8, %xmm3 +; SSE41-NEXT: por %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movapd %xmm9, %xmm11 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm11 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm12, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[0,2] -; SSE41-NEXT: pmaxud %xmm3, %xmm6 -; SSE41-NEXT: psubd %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm8, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pand %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: pmaxud %xmm9, %xmm1 -; SSE41-NEXT: psubd %xmm9, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: psubd %xmm2, %xmm6 +; SSE41-NEXT: psubd %xmm4, %xmm1 ; SSE41-NEXT: movdqa %xmm6, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test34: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] -; AVX1-NEXT: # xmm7 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm4 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm9 = mem[0,0] +; AVX1-NEXT: vxorps %xmm9, %xmm8, %xmm10 +; AVX1-NEXT: vorps %xmm3, %xmm9, %xmm11 +; AVX1-NEXT: vpcmpgtq %xmm11, %xmm10, %xmm10 +; AVX1-NEXT: vblendvpd %xmm10, %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vxorps %xmm2, %xmm9, %xmm8 +; AVX1-NEXT: vpor %xmm7, %xmm9, %xmm10 +; AVX1-NEXT: vpcmpgtq %xmm10, %xmm8, %xmm8 +; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm9, %xmm7 +; AVX1-NEXT: vorps %xmm4, %xmm9, %xmm8 +; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vblendvpd %xmm7, %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm4 +; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: test34: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] -; AVX2-SLOW-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: test34: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm2, %ymm3 -; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-ALL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: test34: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm3[0,2],ymm1[4,6],ymm3[4,6] -; AVX2-FAST-PERLANE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-LABEL: test34: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm6 +; AVX2-NEXT: vpor %ymm5, %ymm4, %ymm7 +; AVX2-NEXT: vpcmpgtq %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vblendvpd %ymm6, %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm4 +; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test34: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll --- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll @@ -16,7 +16,7 @@ ; X86-LABEL: and_signbit_shl: ; X86: # %bb.0: ; X86-NEXT: movl 8(%esp), %ecx -; X86-NEXT: movzbl 6(%esp), %eax +; X86-NEXT: movzwl 6(%esp), %eax ; X86-NEXT: shll $24, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -10,11 +10,17 @@ ; with one of the shifts from the rotate idiom define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) { -; CHECK-LABEL: vroll_v4i32_extract_shl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 -; CHECK-NEXT: vprold $7, %xmm0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: vroll_v4i32_extract_shl: +; X86: # %bb.0: +; X86-NEXT: vprold $10, %xmm0, %xmm0 +; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: vroll_v4i32_extract_shl: +; X64: # %bb.0: +; X64-NEXT: vprold $10, %xmm0, %xmm0 +; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-NEXT: retq %lhs_mul = shl <4 x i32> %i, %rhs_mul = shl <4 x i32> %i, %lhs_shift = lshr <4 x i32> %lhs_mul, @@ -23,11 +29,17 @@ } define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind { -; CHECK-LABEL: vrolq_v4i64_extract_shrl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0 -; CHECK-NEXT: vprolq $29, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: vrolq_v4i64_extract_shrl: +; X86: # %bb.0: +; X86-NEXT: vprolq $24, %ymm0, %ymm0 +; X86-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: vrolq_v4i64_extract_shrl: +; X64: # %bb.0: +; X64-NEXT: vprolq $24, %ymm0, %ymm0 +; X64-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; X64-NEXT: retq %lhs_div = lshr <4 x i64> %i, %rhs_div = lshr <4 x i64> %i, %rhs_shift = shl <4 x i64> %rhs_div, diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -12,20 +12,19 @@ define i64 @rolq_extract_shl(i64 %i) nounwind { ; X86-LABEL: rolq_extract_shl: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shldl $3, %edx, %ecx -; X86-NEXT: shll $3, %eax -; X86-NEXT: shll $3, %edx -; X86-NEXT: shrdl $25, %edx, %eax -; X86-NEXT: shrdl $25, %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shldl $10, %eax, %edx +; X86-NEXT: shldl $10, %ecx, %eax +; X86-NEXT: andl $-897, %eax # imm = 0xFC7F ; X86-NEXT: retl ; ; X64-LABEL: rolq_extract_shl: ; X64: # %bb.0: -; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: rolq $7, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: rolq $10, %rax +; X64-NEXT: andq $-897, %rax # imm = 0xFC7F ; X64-NEXT: retq %lhs_mul = shl i64 %i, 3 %rhs_mul = shl i64 %i, 10 @@ -38,16 +37,16 @@ ; X86-LABEL: rolw_extract_shrl: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: rolw $12, %ax +; X86-NEXT: rolw $9, %ax +; X86-NEXT: andl $61951, %eax # imm = 0xF1FF ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: rolw_extract_shrl: ; X64: # %bb.0: -; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: rolw $12, %ax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: rolw $9, %ax +; X64-NEXT: andl $61951, %eax # imm = 0xF1FF ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %lhs_div = lshr i16 %i, 7 @@ -82,18 +81,24 @@ ; X86-LABEL: rolb_extract_udiv: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $171, %eax, %eax +; X86-NEXT: imull $171, %eax, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: shrl $9, %eax -; X86-NEXT: rolb $4, %al +; X86-NEXT: shrl $13, %ecx +; X86-NEXT: shlb $4, %al +; X86-NEXT: orb %cl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: rolb_extract_udiv: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: imull $171, %eax, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: movl %ecx, %eax ; X64-NEXT: shrl $9, %eax -; X64-NEXT: rolb $4, %al +; X64-NEXT: shrl $13, %ecx +; X64-NEXT: shlb $4, %al +; X64-NEXT: orb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 @@ -119,8 +124,12 @@ ; X64-LABEL: rolq_extract_mul_with_mask: ; X64: # %bb.0: ; X64-NEXT: leaq (%rdi,%rdi,8), %rax -; X64-NEXT: rolq $7, %rax -; X64-NEXT: movzbl %al, %eax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: shll $7, %edi +; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: shrq $57, %rax +; X64-NEXT: orq %rcx, %rax ; X64-NEXT: retq %lhs_mul = mul i64 %i, 1152 %rhs_mul = mul i64 %i, 9 @@ -223,33 +232,32 @@ define i8 @no_extract_udiv(i8 %i) nounwind { ; X86-LABEL: no_extract_udiv: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $171, %eax, %ecx -; X86-NEXT: imull $79, %eax, %edx -; X86-NEXT: subb %dh, %al -; X86-NEXT: shrb %al -; X86-NEXT: addb %dh, %al -; X86-NEXT: shrb $5, %al -; X86-NEXT: shlb $3, %ch -; X86-NEXT: orb %al, %ch -; X86-NEXT: andb $-9, %ch -; X86-NEXT: movb %ch, %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $171, %ecx, %eax +; X86-NEXT: shrl $9, %eax +; X86-NEXT: imull $79, %ecx, %edx +; X86-NEXT: subb %dh, %cl +; X86-NEXT: shrb %cl +; X86-NEXT: addb %dh, %cl +; X86-NEXT: shrb $5, %cl +; X86-NEXT: shlb $4, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: no_extract_udiv: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: imull $171, %ecx, %eax -; X64-NEXT: shrl $8, %eax +; X64-NEXT: shrl $9, %eax ; X64-NEXT: imull $79, %ecx, %edx ; X64-NEXT: shrl $8, %edx ; X64-NEXT: subb %dl, %cl ; X64-NEXT: shrb %cl ; X64-NEXT: addb %dl, %cl ; X64-NEXT: shrb $5, %cl -; X64-NEXT: shlb $3, %al +; X64-NEXT: shlb $4, %al ; X64-NEXT: orb %cl, %al -; X64-NEXT: andb $-9, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %lhs_div = udiv i8 %i, 3 diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -569,11 +569,11 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx ; X86-NEXT: movl 4(%eax), %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shldl $31, %edx, %esi -; X86-NEXT: shldl $31, %ecx, %edx -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shldl $31, %ecx, %esi +; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll --- a/llvm/test/CodeGen/X86/rotate4.ll +++ b/llvm/test/CodeGen/X86/rotate4.ll @@ -244,32 +244,32 @@ ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %esi +; X86-NEXT: movl (%eax), %edx ; X86-NEXT: movl 4(%eax), %ebx -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shll %cl, %esi ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: shldl %cl, %edx, %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %edi -; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: .LBB6_2: ; X86-NEXT: negb %cl ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: shrdl %cl, %ebx, %esi +; X86-NEXT: shrdl %cl, %ebx, %edx ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, %edx ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB6_4: -; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: orl %edx, %esi ; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -336,10 +336,10 @@ ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB7_4: -; X86-NEXT: orl %ebp, %edi ; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: orl %ebp, %edi ; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -36,16 +36,34 @@ } define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) { -; XOP-LABEL: rot_v4i32_splat_2masks: -; XOP: # %bb.0: -; XOP-NEXT: vprotd $31, %xmm0, %xmm0 -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_splat_2masks: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_splat_2masks: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_splat_2masks: ; AVX512: # %bb.0: -; AVX512-NEXT: vprold $31, %xmm0, %xmm0 -; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1, @@ -57,16 +75,34 @@ } define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) { -; XOP-LABEL: rot_v4i32_non_splat_2masks: -; XOP: # %bb.0: -; XOP-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: rot_v4i32_non_splat_2masks: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: rot_v4i32_non_splat_2masks: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOPAVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; XOPAVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq ; ; AVX512-LABEL: rot_v4i32_non_splat_2masks: ; AVX512: # %bb.0: -; AVX512-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = lshr <4 x i32> %x, %2 = and <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -78,9 +78,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -103,9 +103,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -233,9 +233,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -260,9 +260,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -430,9 +430,9 @@ ; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -462,9 +462,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -491,9 +491,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -699,9 +699,14 @@ ; ; AVX-LABEL: sad_nonloop_4i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq %v1 = load <4 x i8>, <4 x i8>* %p, align 1 @@ -729,13 +734,55 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_nonloop_8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_nonloop_8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_nonloop_8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpabsd %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <8 x i8>, <8 x i8>* %p, align 1 %z1 = zext <8 x i8> %v1 to <8 x i32> %v2 = load <8 x i8>, <8 x i8>* %q, align 1 @@ -759,20 +806,83 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pminub %xmm1, %xmm2 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_nonloop_16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_nonloop_16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_nonloop_16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <16 x i8>, <16 x i8>* %p, align 1 %z1 = zext <16 x i8> %v1 to <16 x i32> %v2 = load <16 x i8>, <16 x i8>* %q, align 1 @@ -810,36 +920,102 @@ ; ; AVX1-LABEL: sad_nonloop_32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpabsd %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm4, %xmm1 +; AVX1-NEXT: vpabsd %xmm5, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm6, %xmm2 +; AVX1-NEXT: vpabsd %xmm7, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: sad_nonloop_32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpabsd %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: sad_nonloop_32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm1, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -890,66 +1066,176 @@ ; ; AVX1-LABEL: sad_nonloop_64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 -; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3 -; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 -; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm7, %xmm7 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm8, %xmm8 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm9, %xmm9 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm10, %xmm10 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm11, %xmm11 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm12, %xmm12 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm13, %xmm13 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm15, %xmm14, %xmm14 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm0 +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpabsd %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm4, %xmm2 +; AVX1-NEXT: vpabsd %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpabsd %xmm6, %xmm3 +; AVX1-NEXT: vpabsd %xmm7, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpabsd %xmm8, %xmm2 +; AVX1-NEXT: vpabsd %xmm9, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpabsd %xmm10, %xmm3 +; AVX1-NEXT: vpabsd %xmm11, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm12, %xmm3 +; AVX1-NEXT: vpabsd %xmm13, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpabsd %xmm14, %xmm4 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; ; AVX2-LABEL: sad_nonloop_64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm7 +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpabsd %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpabsd %ymm4, %ymm1 +; AVX2-NEXT: vpabsd %ymm5, %ymm2 +; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpabsd %ymm6, %ymm2 +; AVX2-NEXT: vpabsd %ymm7, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: sad_nonloop_64i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: sad_nonloop_64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm1, %zmm1 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpsubd %zmm4, %zmm3, %zmm3 +; AVX512-NEXT: vpabsd %zmm0, %zmm0 +; AVX512-NEXT: vpabsd %zmm1, %zmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpabsd %zmm2, %zmm1 +; AVX512-NEXT: vpabsd %zmm3, %zmm2 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <64 x i8>, <64 x i8>* %p, align 1 %z1 = zext <64 x i8> %v1 to <64 x i32> %v2 = load <64 x i8>, <64 x i8>* %q, align 1 diff --git a/llvm/test/CodeGen/X86/sad_variations.ll b/llvm/test/CodeGen/X86/sad_variations.ll --- a/llvm/test/CodeGen/X86/sad_variations.ll +++ b/llvm/test/CodeGen/X86/sad_variations.ll @@ -13,14 +13,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sge: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 @@ -53,14 +45,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sgt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -92,14 +76,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_sle: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -131,14 +107,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_32bit_icmp_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -170,14 +138,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_64bit_icmp_sext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -209,14 +169,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_64bit_icmp_zext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -248,14 +200,6 @@ ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq -; -; AVX-LABEL: sad8_early_64bit_icmp_zext_slt: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq entry: br label %for.body @@ -277,3 +221,5 @@ %8 = extractelement <8 x i64> %bin.rdx239, i32 0 ret i64 %8 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/sadd_sat.ll b/llvm/test/CodeGen/X86/sadd_sat.ll --- a/llvm/test/CodeGen/X86/sadd_sat.ll +++ b/llvm/test/CodeGen/X86/sadd_sat.ll @@ -75,7 +75,7 @@ ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addw %cx, %dx ; X86-NEXT: movswl %dx, %edx -; X86-NEXT: sarl $15, %edx +; X86-NEXT: shrl $15, %edx ; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 ; X86-NEXT: addw %cx, %ax ; X86-NEXT: cmovol %edx, %eax @@ -88,7 +88,7 @@ ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: cwtl -; X64-NEXT: sarl $15, %eax +; X64-NEXT: shrl $15, %eax ; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 ; X64-NEXT: addw %si, %di ; X64-NEXT: cmovnol %edi, %eax diff --git a/llvm/test/CodeGen/X86/sadd_sat_plus.ll b/llvm/test/CodeGen/X86/sadd_sat_plus.ll --- a/llvm/test/CodeGen/X86/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_plus.ll @@ -79,7 +79,7 @@ ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addw %cx, %dx ; X86-NEXT: movswl %dx, %edx -; X86-NEXT: sarl $15, %edx +; X86-NEXT: shrl $15, %edx ; X86-NEXT: xorl $-32768, %edx # imm = 0x8000 ; X86-NEXT: addw %cx, %ax ; X86-NEXT: cmovol %edx, %eax @@ -93,7 +93,7 @@ ; X64-NEXT: imull %edx, %esi ; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: cwtl -; X64-NEXT: sarl $15, %eax +; X64-NEXT: shrl $15, %eax ; X64-NEXT: xorl $-32768, %eax # imm = 0x8000 ; X64-NEXT: addw %si, %di ; X64-NEXT: cmovnol %edi, %eax diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -468,7 +468,7 @@ ; SSE-NEXT: movzwl (%rsi), %ecx ; SSE-NEXT: leal (%rax,%rcx), %esi ; SSE-NEXT: movswl %si, %esi -; SSE-NEXT: sarl $15, %esi +; SSE-NEXT: shrl $15, %esi ; SSE-NEXT: xorl $-32768, %esi # imm = 0x8000 ; SSE-NEXT: addw %cx, %ax ; SSE-NEXT: cmovol %esi, %eax @@ -481,7 +481,7 @@ ; AVX-NEXT: movzwl (%rsi), %ecx ; AVX-NEXT: leal (%rax,%rcx), %esi ; AVX-NEXT: movswl %si, %esi -; AVX-NEXT: sarl $15, %esi +; AVX-NEXT: shrl $15, %esi ; AVX-NEXT: xorl $-32768, %esi # imm = 0x8000 ; AVX-NEXT: addw %cx, %ax ; AVX-NEXT: cmovol %esi, %eax @@ -1175,22 +1175,23 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm0, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1352,40 +1353,42 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm5 -; SSE41-NEXT: por %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm8, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1650,74 +1653,78 @@ ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm12 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm12, %xmm13 +; SSE41-NEXT: por %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE41-NEXT: pxor %xmm13, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm3, %xmm10 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm4 -; SSE41-NEXT: por %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm7, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: por %xmm0, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pxor %xmm5, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sandybridge-loads.ll b/llvm/test/CodeGen/X86/sandybridge-loads.ll --- a/llvm/test/CodeGen/X86/sandybridge-loads.ll +++ b/llvm/test/CodeGen/X86/sandybridge-loads.ll @@ -8,9 +8,16 @@ ; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: vmovaps (%rsi), %ymm1 ; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vmovaps (%rdx), %ymm2 ; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rax) ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -383,10 +383,19 @@ ; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -399,10 +408,18 @@ ; SSE-NEXT: paddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -441,10 +458,19 @@ ; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -457,10 +483,18 @@ ; SSE-NEXT: paddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -610,12 +644,13 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 @@ -751,11 +786,13 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -764,11 +801,13 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -841,10 +880,19 @@ ; SSE-NEXT: paddusb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <16 x i8> %x, %y %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -938,10 +986,19 @@ ; SSE-NEXT: paddusw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: -; AVX: # %bb.0: -; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = add <8 x i16> %x, %y %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a diff --git a/llvm/test/CodeGen/X86/scalar_widen_div.ll b/llvm/test/CodeGen/X86/scalar_widen_div.ll --- a/llvm/test/CodeGen/X86/scalar_widen_div.ll +++ b/llvm/test/CodeGen/X86/scalar_widen_div.ll @@ -393,26 +393,27 @@ ; CHECK-NEXT: jle .LBB12_3 ; CHECK-NEXT: # %bb.1: # %bb.nph ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: xorl %r10d, %r10d +; CHECK-NEXT: xorl %r11d, %r11d ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB12_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl (%rdi,%r10), %r8d -; CHECK-NEXT: movl 4(%rdi,%r10), %eax +; CHECK-NEXT: movl 8(%rdi,%r11), %r8d +; CHECK-NEXT: movl (%rdi,%r11), %r9d +; CHECK-NEXT: movl 4(%rdi,%r11), %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 4(%rsi,%r10) -; CHECK-NEXT: movl %eax, %r9d -; CHECK-NEXT: movl %r8d, %eax +; CHECK-NEXT: idivl 4(%rsi,%r11) +; CHECK-NEXT: movl %eax, %r10d +; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl (%rsi,%r10) +; CHECK-NEXT: idivl (%rsi,%r11) ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrd $1, %r9d, %xmm0 -; CHECK-NEXT: movl 8(%rdi,%r10), %eax +; CHECK-NEXT: pinsrd $1, %r10d, %xmm0 +; CHECK-NEXT: movl %r8d, %eax ; CHECK-NEXT: cltd -; CHECK-NEXT: idivl 8(%rsi,%r10) -; CHECK-NEXT: movl %eax, 8(%rdi,%r10) -; CHECK-NEXT: movq %xmm0, (%rdi,%r10) -; CHECK-NEXT: addq $16, %r10 +; CHECK-NEXT: idivl 8(%rsi,%r11) +; CHECK-NEXT: movl %eax, 8(%rdi,%r11) +; CHECK-NEXT: movq %xmm0, (%rdi,%r11) +; CHECK-NEXT: addq $16, %r11 ; CHECK-NEXT: decl %ecx ; CHECK-NEXT: jne .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll --- a/llvm/test/CodeGen/X86/scalarize-fp.ll +++ b/llvm/test/CodeGen/X86/scalarize-fp.ll @@ -671,7 +671,12 @@ ; ; AVX-LABEL: splat0_fdiv_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vrcpps %ymm1, %ymm2 +; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm3, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -745,6 +750,11 @@ ; ; AVX-LABEL: splat0_fdiv_const_op1_v8f32: ; AVX: # %bb.0: +; AVX-NEXT: vrcpps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -764,8 +774,12 @@ ; ; AVX-LABEL: splat0_fdiv_const_op0_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vsubss %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll --- a/llvm/test/CodeGen/X86/sdiv-exact.ll +++ b/llvm/test/CodeGen/X86/sdiv-exact.ll @@ -83,11 +83,12 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,3264175145,3264175145] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3264175145,3264175145] +; X86-NEXT: movaps %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl @@ -108,11 +109,12 @@ ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: psrad $1, %xmm0 ; X86-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,3303820997,3303820997] -; X86-NEXT: pmuludq %xmm0, %xmm1 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,3303820997,3303820997] +; X86-NEXT: movapd %xmm0, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X86-NEXT: movdqa %xmm1, %xmm0 @@ -130,11 +132,12 @@ define <4 x i32> @test7(<4 x i32> %x) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,1749801491,1749801491] +; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-NEXT: pmuludq %xmm1, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-NEXT: pmuludq %xmm1, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl ; @@ -152,11 +155,12 @@ ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrad $3, %xmm1 ; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; X86-NEXT: movdqa {{.*#+}} xmm0 = [1,1,2863311531,2863311531] -; X86-NEXT: pmuludq %xmm1, %xmm0 +; X86-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2863311531,2863311531] +; X86-NEXT: movapd %xmm1, %xmm0 +; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -153,27 +153,26 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %esi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %esi +; X86-NEXT: shll $4, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %bl ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %ch -; X86-NEXT: xorb %cl, %ch +; X86-NEXT: xorb %bl, %cl ; X86-NEXT: testw %dx, %dx -; X86-NEXT: setne %cl -; X86-NEXT: testb %ch, %cl +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl ; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: addl %edi, %edi ; X86-NEXT: movswl %di, %eax @@ -181,6 +180,7 @@ ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %y2 = sext i8 %y to i15 %y3 = shl i15 %y2, 7 @@ -535,168 +535,169 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $60, %esp +; X86-NEXT: subl $64, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shll $31, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shll $31, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %ebx # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: andl $-2147483648, %ebp # imm = 0x80000000 +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sets (%esp) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: andl $-2147483648, %edi # imm = 0x80000000 +; X86-NEXT: orl %eax, %edi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $31, %eax -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: calll __moddi3 +; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: shll $31, %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %ebx, %ecx ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebp +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shll $31, %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: shrl $31, %edi -; X86-NEXT: shldl $31, %edx, %edi +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: pushl %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %esi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: sarl $31, %ebx +; X86-NEXT: shll $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shll $31, %esi ; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: shrl $31, %ebp -; X86-NEXT: shldl $31, %ecx, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: calll __moddi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: sets %bl +; X86-NEXT: xorb (%esp), %bl # 1-byte Folded Reload ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl %esi +; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: calll __divdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: sets %cl -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: sets %dl -; X86-NEXT: xorb %cl, %dl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: setne %cl -; X86-NEXT: testb %dl, %cl +; X86-NEXT: testb %bl, %cl ; X86-NEXT: leal -1(%eax), %ecx ; X86-NEXT: cmovel %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: testl %edi, %edi +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %al +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: setne %dl +; X86-NEXT: testb %al, %dl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: cmovel %eax, %edi ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl +; X86-NEXT: sets %dl +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %al -; X86-NEXT: testb %cl, %al +; X86-NEXT: setne %dh +; X86-NEXT: testb %dl, %dh ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %ecx -; X86-NEXT: cmovel %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %al +; X86-NEXT: leal -1(%eax), %edx +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl +; X86-NEXT: sets %bl +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: setne %al -; X86-NEXT: testb %cl, %al +; X86-NEXT: setne %bh +; X86-NEXT: testb %bl, %bh ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: leal -1(%eax), %ebp -; X86-NEXT: cmovel %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %bl -; X86-NEXT: xorb %al, %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: leal -1(%edi), %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: calll __moddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: orl %eax, %edx -; X86-NEXT: setne %al -; X86-NEXT: testb %bl, %al -; X86-NEXT: cmovel %edi, %esi +; X86-NEXT: leal -1(%eax), %esi +; X86-NEXT: cmovel %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: addl $60, %esp +; X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -181,27 +181,26 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax -; X86-NEXT: movswl %ax, %esi ; X86-NEXT: addl %ecx, %ecx -; X86-NEXT: shrl $4, %esi +; X86-NEXT: shll $4, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: cwtd ; X86-NEXT: idivw %si ; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: leal -1(%eax), %edi +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %bl ; X86-NEXT: testw %cx, %cx ; X86-NEXT: sets %cl -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %ch -; X86-NEXT: xorb %cl, %ch +; X86-NEXT: xorb %bl, %cl ; X86-NEXT: testw %dx, %dx -; X86-NEXT: setne %cl -; X86-NEXT: testb %ch, %cl +; X86-NEXT: setne %dl +; X86-NEXT: testb %cl, %dl ; X86-NEXT: cmovnel %edi, %eax ; X86-NEXT: movswl %ax, %ecx ; X86-NEXT: cmpl $16383, %ecx # imm = 0x3FFF @@ -214,6 +213,7 @@ ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl %y2 = sext i8 %y to i15 %y3 = shl i15 %y2, 7 @@ -577,252 +577,251 @@ ; X64-NEXT: subq $104, %rsp ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; X64-NEXT: psrad $31, %xmm2 -; X64-NEXT: psrlq $31, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pcmpgtd %xmm0, %xmm2 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: paddq %xmm0, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; X64-NEXT: movq %xmm0, %r14 +; X64-NEXT: movq %r14, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: shldq $31, %r14, %rbx ; X64-NEXT: pxor %xmm0, %xmm0 ; X64-NEXT: pcmpgtd %xmm1, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %edx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rdx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax ; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rdx, %r13 ; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: cmovnsq %rcx, %r14 +; X64-NEXT: cmovnsq %rcx, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbx, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rcx, %r13 ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-NEXT: psrlq $1, %xmm1 ; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = mem[0,1,1,3] -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; X64-NEXT: psrad $31, %xmm1 -; X64-NEXT: psrlq $31, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: pcmpgtd %xmm0, %xmm1 -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movq %xmm0, %rdx +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: paddq %xmm1, %xmm1 +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %r14 +; X64-NEXT: movq %r14, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: shldq $31, %r14, %rbx +; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; X64-NEXT: # xmm1 = mem[2,3,2,3] +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movq %xmm1, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %r14 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %rbx, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rcx, %r13 ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movq %r13, %xmm0 ; X64-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: shldq $31, %rbx, %rbp +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: sarq $63, %r14 +; X64-NEXT: shldq $31, %rbx, %r14 ; X64-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; X64-NEXT: # xmm0 = mem[2,3,2,3] ; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: shlq $31, %r12 -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: shlq $31, %rbx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __divti3@PLT ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: subq $1, %r13 -; X64-NEXT: sbbq $0, %r14 -; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx -; X64-NEXT: movq %r12, %rdi -; X64-NEXT: movq %rbp, %rsi +; X64-NEXT: sbbq $0, %rbp +; X64-NEXT: movq %r14, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq %r15, %rcx +; X64-NEXT: xorq %rdx, %r15 +; X64-NEXT: shrq $63, %r15 +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %r14, %rsi +; X64-NEXT: movq %r12, %rcx ; X64-NEXT: callq __modti3@PLT +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: orq %rax, %rdx -; X64-NEXT: setne %al -; X64-NEXT: testb %bl, %al -; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: setne %cl +; X64-NEXT: testl %r15d, %ecx +; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload ; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF ; X64-NEXT: cmovbq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovnsq %rcx, %r13 ; X64-NEXT: cmoveq %rax, %r13 ; X64-NEXT: movl $0, %eax -; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rax, %rbp ; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; X64-NEXT: cmpq %rcx, %r13 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: cmovaq %r13, %rax -; X64-NEXT: testq %r14, %r14 +; X64-NEXT: testq %rbp, %rbp ; X64-NEXT: cmovsq %rcx, %r13 -; X64-NEXT: cmpq $-1, %r14 +; X64-NEXT: cmpq $-1, %rbp ; X64-NEXT: cmoveq %rax, %r13 -; X64-NEXT: movq %r13, %xmm1 -; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: psrlq $1, %xmm0 -; X64-NEXT: shufps $136, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; X64-NEXT: # xmm0 = xmm0[0,2],mem[0,2] +; X64-NEXT: movq %r13, %xmm0 +; X64-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-NEXT: addq $104, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 @@ -844,19 +843,18 @@ ; X86-NEXT: movl 16(%ebp), %edi ; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: sarl $31, %ebx -; X86-NEXT: leal (%edi,%edi), %eax -; X86-NEXT: shrl $31, %edi -; X86-NEXT: shldl $31, %eax, %edi ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %edi, %edi +; X86-NEXT: shrdl $1, %ebx, %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi @@ -881,68 +879,65 @@ ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: sarl $31, %ebx ; X86-NEXT: movl 20(%ebp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrdl $1, %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: shrdl $1, %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl $0 +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 24(%ebp), %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %edx, %edx +; X86-NEXT: shrdl $1, %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %eax +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl $0 +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-NEXT: pushl %eax ; X86-NEXT: calll __modti3 ; X86-NEXT: addl $32, %esp @@ -952,9 +947,9 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax @@ -965,7 +960,6 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %edi ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -1064,20 +1058,20 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %al ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %al +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: testl %edx, %edx ; X86-NEXT: sets %bl ; X86-NEXT: xorb %al, %bl ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl 28(%ebp) ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %ecx +; X86-NEXT: pushl %ecx +; X86-NEXT: pushl 28(%ebp) +; X86-NEXT: pushl %edx +; X86-NEXT: pushl %edx ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %eax diff --git a/llvm/test/CodeGen/X86/select-sra.ll b/llvm/test/CodeGen/X86/select-sra.ll --- a/llvm/test/CodeGen/X86/select-sra.ll +++ b/llvm/test/CodeGen/X86/select-sra.ll @@ -18,7 +18,7 @@ ; CHECK-LABEL: isnonneg_i16: ; CHECK: # %bb.0: ; CHECK-NEXT: movswl %di, %eax -; CHECK-NEXT: sarl $15, %eax +; CHECK-NEXT: shrl $15, %eax ; CHECK-NEXT: orl $542, %eax # imm = 0x21E ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -6,16 +6,18 @@ ; SSE2-LABEL: test_eq_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_eq_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax -; SSE41-NEXT: notl %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> @@ -29,14 +31,20 @@ ; SSE2-LABEL: test_ne_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_ne_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> @@ -47,10 +55,22 @@ } define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_le_1: -; CHECK: # %bb.0: -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_le_1: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_le_1: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sle <4 x i32> %sext, zeroinitializer @@ -63,7 +83,9 @@ ; SSE2-LABEL: test_ge_1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq @@ -71,7 +93,9 @@ ; SSE41-LABEL: test_ge_1: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: pextrd $1, %xmm0, %eax ; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B @@ -104,10 +128,22 @@ } define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_gt_1: -; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_gt_1: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_gt_1: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %A, %B %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sgt <4 x i32> %sext, zeroinitializer @@ -120,16 +156,18 @@ ; SSE2-LABEL: test_eq_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_eq_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax -; SSE41-NEXT: notl %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> @@ -143,14 +181,20 @@ ; SSE2-LABEL: test_ne_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_ne_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> @@ -164,15 +208,19 @@ ; SSE2-LABEL: test_le_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: notl %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_le_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: notl %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A @@ -184,10 +232,22 @@ } define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) { -; CHECK-LABEL: test_ge_2: -; CHECK: # %bb.0: -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: retq +; SSE2-LABEL: test_ge_2: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_ge_2: +; SSE41: # %bb.0: +; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax +; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> %cmp1 = icmp sge <4 x i32> zeroinitializer, %sext @@ -221,14 +281,18 @@ ; SSE2-LABEL: test_gt_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_gt_2: ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: retq %cmp = icmp slt <4 x i32> %B, %A %sext = sext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/setcc-freeze.ll b/llvm/test/CodeGen/X86/setcc-freeze.ll --- a/llvm/test/CodeGen/X86/setcc-freeze.ll +++ b/llvm/test/CodeGen/X86/setcc-freeze.ll @@ -4,7 +4,8 @@ define i32 @f(ptr %p) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: testb $8, 1(%rdi) +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: testl $2048, %eax # imm = 0x800 ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %B ; CHECK-NEXT: movl $20, %eax diff --git a/llvm/test/CodeGen/X86/setcc-fsh.ll b/llvm/test/CodeGen/X86/setcc-fsh.ll --- a/llvm/test/CodeGen/X86/setcc-fsh.ll +++ b/llvm/test/CodeGen/X86/setcc-fsh.ll @@ -252,9 +252,9 @@ define <4 x i1> @or_rotl_ne_eq0(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: or_rotl_ne_eq0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %rot = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32>%x, <4 x i32> %x, <4 x i32> %y) %or = or <4 x i32> %y, %rot @@ -291,10 +291,10 @@ define <4 x i1> @fshl_or2_eq_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $7, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %or = or <4 x i32> %x, %y %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %or, <4 x i32> ) @@ -305,10 +305,10 @@ define <4 x i1> @fshl_or2_commute_eq_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_commute_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $7, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: retq %or = or <4 x i32> %y, %x %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %or, <4 x i32> ) @@ -319,8 +319,8 @@ define i1 @fshr_or_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orw %di, %si +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldw $8, %di, %si ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %or = or i16 %x, %y @@ -332,8 +332,8 @@ define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_commute_eq_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orw %di, %si +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldw $8, %di, %si ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %or = or i16 %y, %x @@ -397,10 +397,10 @@ define <4 x i1> @fshl_or2_ne_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $27, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -413,10 +413,10 @@ define <4 x i1> @fshl_or2_commute_ne_0(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: fshl_or2_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: psrld $27, %xmm1 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -429,8 +429,8 @@ define i1 @fshr_or_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shlq $63, %rsi -; CHECK-NEXT: orq %rdi, %rsi +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldq $63, %rdi, %rsi ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i64 %x, %y @@ -442,8 +442,8 @@ define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: shlq $63, %rsi -; CHECK-NEXT: orq %rdi, %rsi +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shldq $63, %rdi, %rsi ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i64 %y, %x @@ -455,9 +455,8 @@ define i1 @fshr_or2_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %si, %eax -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: orw %di, %ax +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shrdw $2, %di, %si ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i16 %x, %y @@ -469,9 +468,8 @@ define i1 @fshr_or2_commute_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_commute_ne_0: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl %si, %eax -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: orw %di, %ax +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: shrdw $2, %di, %si ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %or = or i16 %y, %x diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll --- a/llvm/test/CodeGen/X86/setcc-logic.ll +++ b/llvm/test/CodeGen/X86/setcc-logic.ll @@ -324,7 +324,11 @@ ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: cmpltpd %xmm0, %xmm1 ; CHECK-NEXT: movmskpd %xmm1, %eax -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: testb $2, %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: sete %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: jne .LBB16_2 ; CHECK-NEXT: # %bb.1: # %true ; CHECK-NEXT: movl $42, %eax @@ -679,18 +683,12 @@ } define i1 @or_cmp_eq_i16(i16 zeroext %x, i16 zeroext %y) { -; NOBMI-LABEL: or_cmp_eq_i16: -; NOBMI: # %bb.0: -; NOBMI-NEXT: notl %edi -; NOBMI-NEXT: testl %esi, %edi -; NOBMI-NEXT: sete %al -; NOBMI-NEXT: retq -; -; BMI-LABEL: or_cmp_eq_i16: -; BMI: # %bb.0: -; BMI-NEXT: andnl %esi, %edi, %eax -; BMI-NEXT: sete %al -; BMI-NEXT: retq +; CHECK-LABEL: or_cmp_eq_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: cmpw %si, %di +; CHECK-NEXT: sete %al +; CHECK-NEXT: retq %o = or i16 %x, %y %c = icmp eq i16 %x, %o ret i1 %c @@ -699,8 +697,8 @@ define i1 @or_cmp_ne_i8(i8 zeroext %x, i8 zeroext %y) { ; CHECK-LABEL: or_cmp_ne_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %sil -; CHECK-NEXT: testb %dil, %sil +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: cmpb %dil, %sil ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %o = or i8 %x, %y diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -599,169 +599,287 @@ define i1 @ne_v4i256(<4 x i256> %a0) { ; SSE2-LABEL: ne_v4i256: ; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: xorl %ebp, %ebp +; SSE2-NEXT: orq %rbx, %r11 +; SSE2-NEXT: setne %bpl +; SSE2-NEXT: negl %ebp +; SSE2-NEXT: movd %ebp, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movq %r10, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movq %rcx, %xmm0 -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: xorl %r11d, %r11d +; SSE2-NEXT: orq %r10, %rax +; SSE2-NEXT: setne %r11b +; SSE2-NEXT: negl %r11d +; SSE2-NEXT: movd %r11d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: orq %rcx, %rsi +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: setne %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; SSE2-NEXT: movq %r9, %xmm0 ; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %r8, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movq %rsi, %xmm0 -; SSE2-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; SSE2-NEXT: movq %rdi, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %r9, %r8 +; SSE2-NEXT: setne %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ne_v4i256: ; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: .cfi_offset %rbx, -16 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; SSE41-NEXT: movq %r10, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movq %rax, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movq %rcx, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %rdx, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE41-NEXT: por %xmm1, %xmm2 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: orq %rcx, %rsi +; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movd %ecx, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; SSE41-NEXT: movq %r9, %xmm0 ; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %r8, %xmm1 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movq %rsi, %xmm0 -; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; SSE41-NEXT: movq %rdi, %xmm3 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: por %xmm2, %xmm3 -; SSE41-NEXT: ptest %xmm3, %xmm3 +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %r9, %r8 +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %rbx, %r11 +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: orq {{[0-9]+}}(%rsp), %rax +; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: orq %r10, %rax +; SSE41-NEXT: setne %cl +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: pinsrd $3, %ecx, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: sete %al +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 8 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ne_v4i256: ; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: .cfi_offset %rbx, -16 ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: orq %r10, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX1-NEXT: orq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX1-NEXT: orq %rcx, %rsi +; AVX1-NEXT: orq %rdx, %rdi +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %rsi, %rdi +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX1-NEXT: orq %r9, %rsi -; AVX1-NEXT: vmovq %rsi, %xmm1 ; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX1-NEXT: orq %r8, %rdi -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %r9, %r8 +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %rbx, %r11 +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: xorl %ecx, %ecx +; AVX1-NEXT: orq %r10, %rax +; AVX1-NEXT: setne %cl +; AVX1-NEXT: negl %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 8 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_v4i256: ; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 16 +; AVX2-NEXT: .cfi_offset %rbx, -16 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: orq %r10, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX2-NEXT: orq %r9, %rsi -; AVX2-NEXT: vmovq %rsi, %xmm1 ; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX2-NEXT: orq %r8, %rdi -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %r9, %r8 +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %rbx, %r11 +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: xorl %ecx, %ecx +; AVX2-NEXT: orq %r10, %rax +; AVX2-NEXT: setne %cl +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX2-NEXT: vtestps %xmm0, %xmm0 ; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: .cfi_def_cfa_offset 8 ; AVX2-NEXT: retq ; -; AVX512-LABEL: ne_v4i256: -; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: shrq $32, %rax -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: shrq $32, %r10 -; AVX512-NEXT: vpinsrd $3, %r10d, %xmm0, %xmm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: vmovd %r8d, %xmm1 -; AVX512-NEXT: shrq $32, %r8 -; AVX512-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9 -; AVX512-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx -; AVX512-NEXT: vmovd %edx, %xmm1 -; AVX512-NEXT: shrq $32, %rdx -; AVX512-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: shrq $32, %rcx -; AVX512-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: shrq $32, %rdi -; AVX512-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2 -; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: ne_v4i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: pushq %r14 +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: .cfi_def_cfa_offset 24 +; AVX512F-NEXT: .cfi_offset %rbx, -24 +; AVX512F-NEXT: .cfi_offset %r14, -16 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: orq %rcx, %rsi +; AVX512F-NEXT: orq %rdx, %rdi +; AVX512F-NEXT: orq %rsi, %rdi +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; AVX512F-NEXT: orq %r9, %r8 +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: kmovw %ecx, %k0 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512F-NEXT: orq %r14, %rbx +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: orq %r11, %r10 +; AVX512F-NEXT: setne %cl +; AVX512F-NEXT: andl $1, %ecx +; AVX512F-NEXT: kmovw %ecx, %k2 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k2, %k0 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-NEXT: popq %r14 +; AVX512F-NEXT: .cfi_def_cfa_offset 8 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: ne_v4i256: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: pushq %r14 +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: .cfi_def_cfa_offset 24 +; AVX512BW-NEXT: .cfi_offset %rbx, -24 +; AVX512BW-NEXT: .cfi_offset %r14, -16 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: orq %rcx, %rsi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: orq %rsi, %rdi +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r9 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r8 +; AVX512BW-NEXT: orq %r9, %r8 +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: kmovd %ecx, %k0 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r14 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %rbx +; AVX512BW-NEXT: orq %r14, %rbx +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: orq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: orq %r11, %r10 +; AVX512BW-NEXT: setne %cl +; AVX512BW-NEXT: andl $1, %ecx +; AVX512BW-NEXT: kmovw %ecx, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: kmovw %eax, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: popq %rbx +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: popq %r14 +; AVX512BW-NEXT: .cfi_def_cfa_offset 8 +; AVX512BW-NEXT: retq %c = icmp ne <4 x i256> %a0, zeroinitializer %b = bitcast <4 x i1> %c to i4 %r = icmp eq i4 %b, 0 @@ -1334,65 +1452,35 @@ } define i1 @eq_i512_op(i512 %a, i512 %b) { -; SSE-LABEL: eq_i512_op: -; SSE: # %bb.0: -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: addq $1, %rdi -; SSE-NEXT: adcq $0, %rsi -; SSE-NEXT: adcq $0, %rdx -; SSE-NEXT: adcq $0, %rcx -; SSE-NEXT: adcq $0, %r8 -; SSE-NEXT: adcq $0, %r9 -; SSE-NEXT: adcq $0, %r10 -; SSE-NEXT: adcq $0, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; SSE-NEXT: orq %rsi, %r9 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: orq %rdx, %r10 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; SSE-NEXT: orq %r8, %rdi -; SSE-NEXT: orq %r10, %rdi -; SSE-NEXT: orq %rax, %rdi -; SSE-NEXT: sete %al -; SSE-NEXT: retq -; -; AVXANY-LABEL: eq_i512_op: -; AVXANY: # %bb.0: -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: addq $1, %rdi -; AVXANY-NEXT: adcq $0, %rsi -; AVXANY-NEXT: adcq $0, %rdx -; AVXANY-NEXT: adcq $0, %rcx -; AVXANY-NEXT: adcq $0, %r8 -; AVXANY-NEXT: adcq $0, %r9 -; AVXANY-NEXT: adcq $0, %r10 -; AVXANY-NEXT: adcq $0, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; AVXANY-NEXT: orq %rsi, %r9 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; AVXANY-NEXT: orq %rcx, %rax -; AVXANY-NEXT: orq %r9, %rax -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 -; AVXANY-NEXT: orq %rdx, %r10 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi -; AVXANY-NEXT: orq %r8, %rdi -; AVXANY-NEXT: orq %r10, %rdi -; AVXANY-NEXT: orq %rax, %rdi -; AVXANY-NEXT: sete %al -; AVXANY-NEXT: retq +; ANY-LABEL: eq_i512_op: +; ANY: # %bb.0: +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: addq $1, %rdi +; ANY-NEXT: adcq $0, %rsi +; ANY-NEXT: adcq $0, %rdx +; ANY-NEXT: adcq $0, %rcx +; ANY-NEXT: adcq $0, %r8 +; ANY-NEXT: adcq $0, %r9 +; ANY-NEXT: adcq $0, %r10 +; ANY-NEXT: adcq $0, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; ANY-NEXT: orq %rsi, %r9 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax +; ANY-NEXT: orq %rcx, %rax +; ANY-NEXT: orq %r9, %rax +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 +; ANY-NEXT: orq %rdx, %r10 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 +; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi +; ANY-NEXT: orq %r8, %rdi +; ANY-NEXT: orq %r10, %rdi +; ANY-NEXT: orq %rax, %rdi +; ANY-NEXT: sete %al +; ANY-NEXT: retq %a2 = add i512 %a, 1 %r = icmp eq i512 %a2, %b ret i1 %r diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll --- a/llvm/test/CodeGen/X86/setcc.ll +++ b/llvm/test/CodeGen/X86/setcc.ll @@ -141,7 +141,7 @@ define zeroext i1 @t7(i32 %0) { ; X86-LABEL: t7: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $19, %ecx ; X86-NEXT: btl %eax, %ecx ; X86-NEXT: setb %al diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -225,8 +225,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB7_2: -; X32-NEXT: movl %esi, (%eax) ; X32-NEXT: movl %edx, 4(%eax) +; X32-NEXT: movl %esi, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -463,8 +463,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB15_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -705,8 +705,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: movl %edi, %esi ; X32-NEXT: .LBB23_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -782,7 +782,7 @@ ; X64-LABEL: reg64_lshr_by_sub_from_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -844,7 +844,7 @@ ; X64-LABEL: reg64_lshr_by_sub_of_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1033,7 +1033,7 @@ ; X64-LABEL: reg64_lshr_by_add_of_negated_amts: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1157,7 +1157,7 @@ ; X64-LABEL: reg64_lshr_by_negated_unfolded_sub_b: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1218,7 +1218,7 @@ ; X64-LABEL: reg64_lshr_by_b_sub_negated_unfolded: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1556,9 +1556,8 @@ define i16 @sh_trunc_sh(i64 %x) { ; X32-LABEL: sh_trunc_sh: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrl $4, %eax -; X32-NEXT: andl $15, %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: shrl $12, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax ; X32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -144,8 +144,8 @@ ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB5_2: -; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: movl %edx, (%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/shift-by-signext.ll b/llvm/test/CodeGen/X86/shift-by-signext.ll --- a/llvm/test/CodeGen/X86/shift-by-signext.ll +++ b/llvm/test/CodeGen/X86/shift-by-signext.ll @@ -88,9 +88,9 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind { ; X86-LABEL: n6_fshl: ; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: retl ; @@ -108,9 +108,9 @@ define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind { ; X86-LABEL: n7_fshr: ; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -536,11 +536,11 @@ define i32 @xor_tree_with_shifts_i32(i32 %a, i32 %b, i32 %c, i32 %d) { ; X32-LABEL: xor_tree_with_shifts_i32: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-NEXT: shrl $16, %eax -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorl %ecx, %eax ; X32-NEXT: retl ; ; X64-LABEL: xor_tree_with_shifts_i32: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -14,7 +14,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -85,7 +85,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -158,7 +158,7 @@ ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $32, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -552,7 +552,7 @@ ; i686-NEXT: pushl %ebx ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: subl $100, %esp +; i686-NEXT: subl $92, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -571,92 +571,95 @@ ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ebp, %ecx -; i686-NEXT: shrl $3, %ebp -; i686-NEXT: andl $15, %ebp -; i686-NEXT: leal {{[0-9]+}}(%esp), %eax -; i686-NEXT: subl %ebp, %eax +; i686-NEXT: movl %ebp, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax +; i686-NEXT: leal {{[0-9]+}}(%esp), %edi +; i686-NEXT: subl %eax, %edi ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl 8(%eax), %edx -; i686-NEXT: movl %edx, (%esp) # 4-byte Spill +; i686-NEXT: movl 8(%edi), %esi +; i686-NEXT: movl %esi, (%esp) # 4-byte Spill ; i686-NEXT: andl $7, %ecx ; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shll %cl, %edx -; i686-NEXT: movl 4(%eax), %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %esi +; i686-NEXT: shll %cl, %esi ; i686-NEXT: notl %ecx +; i686-NEXT: negl %eax +; i686-NEXT: movl 48(%esp,%eax), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrl %eax +; i686-NEXT: movl {{[0-9]+}}(%esp), %edx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, %esi -; i686-NEXT: orl %edx, %esi -; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl (%eax), %eax +; i686-NEXT: shrl %cl, %eax +; i686-NEXT: orl %esi, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %edx -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $15, %edx +; i686-NEXT: movl %edx, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $15, %eax ; i686-NEXT: leal {{[0-9]+}}(%esp), %esi -; i686-NEXT: subl %edx, %esi +; i686-NEXT: subl %eax, %esi +; i686-NEXT: movl (%edi), %ecx +; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: andl $7, %ebx -; i686-NEXT: movl 8(%esi), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl 4(%esi), %eax -; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shrl %eax -; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: andl $7, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 8(%esi), %ebx +; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl %edx, %ecx +; i686-NEXT: shll %cl, %ebx +; i686-NEXT: movl %edx, %ecx ; i686-NEXT: notl %ecx +; i686-NEXT: negl %eax +; i686-NEXT: movl 80(%esp,%eax), %ebp +; i686-NEXT: movl %ebp, %eax +; i686-NEXT: shrl %eax ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shrl %cl, %eax -; i686-NEXT: orl %edi, %eax -; i686-NEXT: movl (%esi), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi +; i686-NEXT: orl %ebx, %eax +; i686-NEXT: movl 12(%edi), %edi ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %edi -; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: negl %ebp -; i686-NEXT: movl 64(%esp,%ebp), %esi -; i686-NEXT: movl %edi, %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: movl (%esp), %edi # 4-byte Reload -; i686-NEXT: shldl %cl, %edi, %esi -; i686-NEXT: movl %esi, (%esp) # 4-byte Spill -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: movl %esi, %edi -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shll %cl, %edi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; i686-NEXT: shldl %cl, %esi, %ebp -; i686-NEXT: negl %edx -; i686-NEXT: movl 96(%esp,%edx), %edx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; i686-NEXT: shldl %cl, %ebx, %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl %edx, 28(%ecx) -; i686-NEXT: movl %ebp, 20(%ecx) -; i686-NEXT: movl %edi, 16(%ecx) ; i686-NEXT: movl (%esp), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 12(%ecx) +; i686-NEXT: shldl %cl, %edx, %edi +; i686-NEXT: movl %edi, (%esp) # 4-byte Spill +; i686-NEXT: movl (%esi), %edi +; i686-NEXT: movl 12(%esi), %ebx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, 4(%ecx) +; i686-NEXT: shldl %cl, %edx, %ebx +; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movl %ebx, 28(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: movl %esi, %ebx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: shll %cl, %ebx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; i686-NEXT: movl %edx, (%ecx) -; i686-NEXT: movl %eax, 24(%ecx) +; i686-NEXT: shldl %cl, %esi, %edx +; i686-NEXT: movl %edi, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shll %cl, %esi +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shldl %cl, %edi, %ebp +; i686-NEXT: movl {{[0-9]+}}(%esp), %edi +; i686-NEXT: movl %ebp, 20(%edi) +; i686-NEXT: movl %esi, 16(%edi) +; i686-NEXT: movl (%esp), %ecx # 4-byte Reload +; i686-NEXT: movl %ecx, 12(%edi) +; i686-NEXT: movl %edx, 4(%edi) +; i686-NEXT: movl %ebx, (%edi) +; i686-NEXT: movl %eax, 24(%edi) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, 8(%ecx) -; i686-NEXT: addl $100, %esp +; i686-NEXT: movl %eax, 8(%edi) +; i686-NEXT: addl $92, %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi ; i686-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -3,8 +3,192 @@ ; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s -check-prefixes=CHECK-X64,CHECK-X64-O0 ; RUN: llc < %s -mtriple=x86_64-- -O2 | FileCheck %s -check-prefixes=CHECK-X64,CHECK-X64-O2 -; CHECK-LABEL: shift1 define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { +; CHECK-LABEL: shift1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $92, %esp +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: andb $7, %al +; CHECK-NEXT: shrb $3, %cl +; CHECK-NEXT: movzbl %cl, %ebp +; CHECK-NEXT: movl 32(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: notb %dl +; CHECK-NEXT: movl 36(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 40(%esp,%ebp), %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %esi +; CHECK-NEXT: movl 44(%esp,%ebp), %ecx +; CHECK-NEXT: movl %ecx, (%esp) # 4-byte Spill +; CHECK-NEXT: leal (%ecx,%ecx), %edi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %edi +; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 48(%esp,%ebp), %ebx +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl %cl, %ebx +; CHECK-NEXT: movl 52(%esp,%ebp), %edi +; CHECK-NEXT: leal (%edi,%edi), %esi +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: shll %cl, %esi +; CHECK-NEXT: orl %ebx, %esi +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill +; CHECK-NEXT: movl 28(%esp,%ebp), %edx +; CHECK-NEXT: movl 56(%esp,%ebp), %ebx +; CHECK-NEXT: shrdl %cl, %ebx, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; CHECK-NEXT: shrdl %cl, %ebp, %edx +; CHECK-NEXT: sarl %cl, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebx, 28(%eax) +; CHECK-NEXT: movl %edi, 24(%eax) +; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl %edx, (%eax) +; CHECK-NEXT: movl %esi, 20(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 4(%eax) +; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl +; +; CHECK-X64-O0-LABEL: shift1: +; CHECK-X64-O0: # %bb.0: # %entry +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-X64-O0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: sarq $63, %rcx +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O0-NEXT: movb %r8b, %dl +; CHECK-X64-O0-NEXT: movb %dl, %cl +; CHECK-X64-O0-NEXT: andb $7, %cl +; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-X64-O0-NEXT: shrb $3, %dl +; CHECK-X64-O0-NEXT: movzbl %dl, %edx +; CHECK-X64-O0-NEXT: movl %edx, %edi +; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi), %rdx +; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi), %r8 +; CHECK-X64-O0-NEXT: movq %r8, %r9 +; CHECK-X64-O0-NEXT: shrq %cl, %r9 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: notb %cl +; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi), %rsi +; CHECK-X64-O0-NEXT: movq %rsi, %r10 +; CHECK-X64-O0-NEXT: addq %r10, %r10 +; CHECK-X64-O0-NEXT: shlq %cl, %r10 +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: orq %r10, %r9 +; CHECK-X64-O0-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-X64-O0-NEXT: movq -40(%rsp,%rdi), %rdi +; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: shrdq %cl, %r8, %rdx +; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload +; CHECK-X64-O0-NEXT: sarq %cl, %rdi +; CHECK-X64-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; CHECK-X64-O0-NEXT: movq %rdi, 24(%rax) +; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax) +; CHECK-X64-O0-NEXT: movq %rdx, (%rax) +; CHECK-X64-O0-NEXT: movq %rcx, 8(%rax) +; CHECK-X64-O0-NEXT: retq +; +; CHECK-X64-O2-LABEL: shift1: +; CHECK-X64-O2: # %bb.0: # %entry +; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: sarq $63, %rcx +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-X64-O2-NEXT: movl %r8d, %eax +; CHECK-X64-O2-NEXT: andb $7, %al +; CHECK-X64-O2-NEXT: shrb $3, %r8b +; CHECK-X64-O2-NEXT: movzbl %r8b, %edx +; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx), %rsi +; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx), %rdi +; CHECK-X64-O2-NEXT: movq %rdi, %r8 +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrq %cl, %r8 +; CHECK-X64-O2-NEXT: notb %cl +; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx), %r10 +; CHECK-X64-O2-NEXT: leaq (%r10,%r10), %r11 +; CHECK-X64-O2-NEXT: shlq %cl, %r11 +; CHECK-X64-O2-NEXT: orq %r8, %r11 +; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rdx +; CHECK-X64-O2-NEXT: movl %eax, %ecx +; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %r10 +; CHECK-X64-O2-NEXT: shrdq %cl, %rdi, %rsi +; CHECK-X64-O2-NEXT: sarq %cl, %rdx +; CHECK-X64-O2-NEXT: movq %rdx, 24(%r9) +; CHECK-X64-O2-NEXT: movq %r10, 16(%r9) +; CHECK-X64-O2-NEXT: movq %rsi, (%r9) +; CHECK-X64-O2-NEXT: movq %r11, 8(%r9) +; CHECK-X64-O2-NEXT: retq entry: %0 = ashr i256 %x, %a store i256 %0, ptr %r @@ -19,7 +203,7 @@ ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $92, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -129,7 +129,7 @@ ; ; X64-SHIFT-LABEL: test_i16_shl_lshr_1: ; X64-SHIFT: # %bb.0: -; X64-SHIFT-NEXT: movzwl %di, %eax +; X64-SHIFT-NEXT: movl %edi, %eax ; X64-SHIFT-NEXT: shrl $3, %eax ; X64-SHIFT-NEXT: shll $5, %eax ; X64-SHIFT-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/shift-parts.ll b/llvm/test/CodeGen/X86/shift-parts.ll --- a/llvm/test/CodeGen/X86/shift-parts.ll +++ b/llvm/test/CodeGen/X86/shift-parts.ll @@ -11,15 +11,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq g_144+24(%rip), %rax ; CHECK-NEXT: movq g_144+16(%rip), %rcx -; CHECK-NEXT: movzbl %sil, %edx -; CHECK-NEXT: shll $6, %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %for.cond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb $64, %dl -; CHECK-NEXT: movq %rcx, %rsi -; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: testl %esi, %esi +; CHECK-NEXT: testb $1, %sil +; CHECK-NEXT: movq %rcx, %rdx +; CHECK-NEXT: cmovneq %rax, %rdx +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/shl_elim.ll b/llvm/test/CodeGen/X86/shl_elim.ll --- a/llvm/test/CodeGen/X86/shl_elim.ll +++ b/llvm/test/CodeGen/X86/shl_elim.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: test1: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shrl %eax +; CHECK-NEXT: shrl $9, %eax ; CHECK-NEXT: cwtl ; CHECK-NEXT: retl %tmp29 = lshr i64 %a, 24 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -719,18 +719,18 @@ ; X86-SSE-NEXT: pmulhuw %xmm2, %xmm4 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; X86-SSE-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm3, %xmm4 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -790,18 +790,18 @@ ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1216,18 +1216,18 @@ ; X86-SSE-NEXT: pmulhw %xmm2, %xmm4 ; X86-SSE-NEXT: pmullw %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; X86-SSE-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm3, %xmm4 ; X86-SSE-NEXT: pmullw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm1, 32(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm3, 48(%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm0, (%ecx,%eax,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%ecx,%eax,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm1, 48(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm3, 32(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%ecx,%eax,4) +; X86-SSE-NEXT: movdqu %xmm2, (%ecx,%eax,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1287,18 +1287,18 @@ ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: @@ -1427,7 +1427,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1451,7 +1451,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1619,7 +1619,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1643,7 +1643,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1683,7 +1683,7 @@ ; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx ; X86-SSE-NEXT: movd %ecx, %xmm0 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $24, %xmm0 ; X86-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) @@ -1707,7 +1707,7 @@ ; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx ; X64-SSE-NEXT: movd %ecx, %xmm0 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X64-SSE-NEXT: psrad $24, %xmm0 ; X64-SSE-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) @@ -1750,6 +1750,8 @@ ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: psllq $32, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1773,6 +1775,8 @@ ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-SSE-NEXT: psllq $32, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2007,13 +2011,13 @@ ; X86-SSE-NEXT: movd %xmm4, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm4 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: divl 16(%esi) -; X86-SSE-NEXT: movd %edx, %xmm3 +; X86-SSE-NEXT: movd %edx, %xmm4 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X86-SSE-NEXT: movd %xmm2, %eax ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] @@ -2021,54 +2025,54 @@ ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; X86-SSE-NEXT: movl %edi, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl (%esi) -; X86-SSE-NEXT: movd %edx, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; X86-SSE-NEXT: movd %xmm6, %ecx ; X86-SSE-NEXT: movl %ebx, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: movd %edx, %xmm6 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; X86-SSE-NEXT: movd %xmm7, %ecx ; X86-SSE-NEXT: movl %ebp, %eax ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax +; X86-SSE-NEXT: movd %edx, %xmm7 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X86-SSE-NEXT: movd %xmm5, %eax ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X86-SSE-NEXT: movd %xmm0, %ecx ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl %ecx ; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X86-SSE-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SSE-NEXT: xorl %edx, %edx ; X86-SSE-NEXT: divl 32(%esi) +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm6 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm4 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-SSE-NEXT: movl %eax, (%eax) -; X86-SSE-NEXT: movdqa %xmm3, (%eax) -; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: movdqa %xmm4, (%eax) +; X86-SSE-NEXT: movdqa %xmm2, (%eax) ; X86-SSE-NEXT: addl $4, %esp ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: popl %edi @@ -2228,12 +2232,12 @@ ; X64-SSE-NEXT: movd %xmm4, %r11d ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %r11d -; X64-SSE-NEXT: movd %edx, %xmm4 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; X64-SSE-NEXT: movl %edi, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 16(%rsi) -; X64-SSE-NEXT: movd %edx, %xmm3 +; X64-SSE-NEXT: movd %edx, %xmm4 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X64-SSE-NEXT: movd %xmm2, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] @@ -2241,54 +2245,54 @@ ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0] ; X64-SSE-NEXT: movl %r8d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl (%rsi) -; X64-SSE-NEXT: movd %edx, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; X64-SSE-NEXT: movd %xmm2, %edi +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; X64-SSE-NEXT: movd %xmm6, %edi ; X64-SSE-NEXT: movl %r9d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-SSE-NEXT: movd %xmm2, %edi +; X64-SSE-NEXT: movd %edx, %xmm6 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] +; X64-SSE-NEXT: movd %xmm7, %edi ; X64-SSE-NEXT: movl %r10d, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; X64-SSE-NEXT: movd %xmm4, %eax +; X64-SSE-NEXT: movd %edx, %xmm7 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; X64-SSE-NEXT: movd %xmm5, %eax ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; X64-SSE-NEXT: movd %xmm0, %edi ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl %edi ; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X64-SSE-NEXT: movl %ecx, %eax ; X64-SSE-NEXT: xorl %edx, %edx ; X64-SSE-NEXT: divl 32(%rsi) +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm6 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-SSE-NEXT: movl %eax, (%rax) -; X64-SSE-NEXT: movdqa %xmm3, (%rax) -; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: movdqa %xmm4, (%rax) +; X64-SSE-NEXT: movdqa %xmm2, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: PR34947: diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -156,9 +156,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, (%rsi) ; SSE2-NEXT: retq @@ -463,8 +463,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -541,8 +541,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_4: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) @@ -619,8 +619,8 @@ ; SSE2-LABEL: shuffle_v16i8_to_v2i8_6: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rsi) diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -64,10 +64,10 @@ ; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] -; AVX512BWVL-FAST-ALL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-FAST-ALL-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512BWVL-FAST-ALL-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-FAST-ALL-NEXT: vpermi2ps 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi) ; AVX512BWVL-FAST-ALL-NEXT: vzeroupper ; AVX512BWVL-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -276,9 +276,9 @@ ; ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: @@ -290,16 +290,16 @@ ; ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper +; AVX512VBMIVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VBMIVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512VBMIVL-NEXT: vmovaps %xmm0, (%rsi) ; AVX512VBMIVL-NEXT: retq %vec = load <8 x i32>, ptr %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1324,74 +1324,74 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX1-LABEL: negative: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm1, %eax +; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: negative: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm1, %eax +; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: negative: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: negative: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512VL-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: negative: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovd %xmm1, %eax +; AVX512BW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: negative: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 -; AVX512BWVL-NEXT: kmovd %eax, %k1 -; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm1, %eax +; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: negative: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 -; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMIVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovd %xmm1, %eax +; AVX512VBMIVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -166,12 +166,32 @@ } define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind { -; AVX512-LABEL: shuffle_v16i32_to_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i32_to_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps 32(%rdi), %ymm0, %ymm1 +; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm1, (%rsi) +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq %vec = load <16 x i32>, ptr %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> store <8 x i32> %strided.vec, ptr %S diff --git a/llvm/test/CodeGen/X86/signbit-shift.ll b/llvm/test/CodeGen/X86/signbit-shift.ll --- a/llvm/test/CodeGen/X86/signbit-shift.ll +++ b/llvm/test/CodeGen/X86/signbit-shift.ll @@ -86,8 +86,9 @@ ; CHECK-LABEL: add_sext_ifpos: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: shrl $31, %edi -; CHECK-NEXT: leal 41(%rdi), %eax +; CHECK-NEXT: notl %edi +; CHECK-NEXT: sarl $31, %edi +; CHECK-NEXT: leal 42(%rdi), %eax ; CHECK-NEXT: retq %c = icmp sgt i32 %x, -1 %e = sext i1 %c to i32 diff --git a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll --- a/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll +++ b/llvm/test/CodeGen/X86/single_elt_vector_memory_operation.ll @@ -87,8 +87,8 @@ ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: load_single_256bit_elt_vector: @@ -129,8 +129,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_single_256bit_elt_vector: @@ -186,8 +186,8 @@ ; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -198,8 +198,8 @@ ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -226,17 +226,17 @@ ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_single_512bit_elt_vector: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -244,8 +244,8 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -441,8 +441,8 @@ ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %edi @@ -838,16 +838,16 @@ ; X64-NEXT: negq %r11 ; X64-NEXT: andl $1, %r10d ; X64-NEXT: negq %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: addq %rdx, %rdi ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: addq %rax, %rdi diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -387,8 +387,9 @@ define <8 x i64> @pr23259() #1 { ; AVX-LABEL: pr23259: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovaps A+16(%rip), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1,1] +; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vpinsrq $0, A+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -185,7 +185,7 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: imulq %r8 ; SSE2-NEXT: movq %rdx, %rax @@ -193,10 +193,10 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx @@ -208,19 +208,21 @@ ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3] -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: notl %ecx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: # kill: def $cl killed $cl killed $ecx ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_srem_vec: @@ -269,9 +271,9 @@ ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrb $8, %xmm0, %edx -; SSE41-NEXT: pextrb $0, %xmm2, %ecx +; SSE41-NEXT: movd %xmm2, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll --- a/llvm/test/CodeGen/X86/srem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-optsize.ll @@ -47,11 +47,18 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %eax # imm = 0x19999999 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $42, %eax -; X86-NEXT: jb .LBB1_2 +; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax ; X86-NEXT: .LBB1_2: @@ -59,12 +66,17 @@ ; ; X64-LABEL: test_optsize: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %eax # imm = 0x19999999 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq $1717986919, %rax, %rcx # imm = 0x66666667 +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rcx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: movl $42, %ecx ; X64-NEXT: movl $-10, %eax -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %ecx, %eax ; X64-NEXT: retq %rem = srem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -9,76 +9,131 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,171798690,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE41-NEXT: psrad $1, %xmm5 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -94,55 +149,125 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -153,56 +278,126 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -215,70 +410,131 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -289,71 +545,132 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm5 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -366,77 +683,145 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -447,78 +832,146 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -533,70 +986,123 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -610,83 +1116,107 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,268435456,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -700,82 +1230,131 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE41-NEXT: psrad $1, %xmm5 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm5 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -791,54 +1370,123 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1717986919,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -852,70 +1500,121 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -928,76 +1627,139 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1014,77 +1776,123 @@ ; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483647,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: psrad $30, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-SSE41-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,0,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1717986919,1717986919,1717986919,1717986919] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1097,114 +1905,129 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $30, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: por %xmm5, %xmm4 -; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1217,110 +2040,144 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4294967295,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $30, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3067833783,1,3264175145] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $30, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: por %xmm4, %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672] -; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1335,71 +2192,140 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1717986919,1717986919] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $3, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1412,83 +2338,129 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm3[0,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1501,83 +2473,144 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE41-NEXT: psrad $1, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: paddd %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1592,55 +2625,126 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1717986919,1717986919] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1653,70 +2757,128 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,2454267027,2454267027] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[1,2] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] +; CHECK-SSE2-NEXT: psrld $31, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1729,77 +2891,137 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrad $1, %xmm3 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1814,76 +3036,134 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $1, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $3, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1897,83 +3177,119 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1986,76 +3302,139 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,0,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm3 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 +; CHECK-SSE41-NEXT: psrad $3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 +; CHECK-SSE41-NEXT: psrad $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm4 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -2070,68 +3449,122 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm5 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm6 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrlq $32, %xmm3 +; CHECK-SSE2-NEXT: psubd %xmm6, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 +; CHECK-SSE2-NEXT: psrad $3, %xmm4 +; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 +; CHECK-SSE2-NEXT: psrad $1, %xmm5 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrad $1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -2143,68 +3576,112 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,0] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 +; CHECK-SSE2-NEXT: psrlq $32, %xmm5 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrad $3, %xmm5 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 +; CHECK-SSE2-NEXT: paddd %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrlq $32, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlq $32, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] +; CHECK-AVX512VL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -2218,121 +3695,190 @@ ; CHECK-SSE2-LABEL: pr51133: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movq %rdi, %rax -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: psrlw $8, %xmm5 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm6 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm6, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm6 ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE2-NEXT: psrlw $8, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] -; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psrlw $8, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrlw $8, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] -; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx -; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx -; CHECK-SSE2-NEXT: shll $16, %edx +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; CHECK-SSE2-NEXT: psraw $8, %xmm8 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm8 +; CHECK-SSE2-NEXT: psrlw $7, %xmm7 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-SSE2-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: psubb %xmm7, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm8 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm9 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; CHECK-SSE2-NEXT: psraw $8, %xmm9 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; CHECK-SSE2-NEXT: psrlw $8, %xmm9 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm9 +; CHECK-SSE2-NEXT: psrlw $7, %xmm7 +; CHECK-SSE2-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE2-NEXT: paddb %xmm9, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm7, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm7 +; CHECK-SSE2-NEXT: psubb %xmm7, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE2-NEXT: pandn %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pmovmskb %xmm3, %ecx +; CHECK-SSE2-NEXT: shll $16, %ecx +; CHECK-SSE2-NEXT: pmovmskb %xmm2, %edx ; CHECK-SSE2-NEXT: orl %ecx, %edx ; CHECK-SSE2-NEXT: movl %edx, (%rdi) ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: pr51133: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE41-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE41-NEXT: psrlw $8, %xmm6 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm6 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE41-NEXT: pand %xmm0, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm6, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm5 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; CHECK-SSE41-NEXT: psraw $8, %xmm8 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm8 +; CHECK-SSE41-NEXT: psrlw $7, %xmm7 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-SSE41-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE41-NEXT: psubb %xmm8, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 +; CHECK-SSE41-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm9 +; CHECK-SSE41-NEXT: pshufb {{.*#+}} xmm9 = zero,zero,xmm9[9],zero,zero,zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,zero,zero,xmm9[15],zero +; CHECK-SSE41-NEXT: packuswb %xmm9, %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm8, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm8 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm8 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: movdqa %xmm7, %xmm9 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; CHECK-SSE41-NEXT: psraw $8, %xmm9 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 +; CHECK-SSE41-NEXT: psrlw $8, %xmm9 +; CHECK-SSE41-NEXT: packuswb %xmm8, %xmm9 +; CHECK-SSE41-NEXT: psrlw $7, %xmm7 +; CHECK-SSE41-NEXT: pand %xmm6, %xmm7 +; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: paddb %xmm9, %xmm7 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE41-NEXT: pand %xmm5, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 -; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: psrlw $8, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] -; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1 -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psrlw $8, %xmm4 -; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 -; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] -; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm6 +; CHECK-SSE41-NEXT: psubb %xmm6, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm5, %xmm5 +; CHECK-SSE41-NEXT: pxor %xmm5, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm5, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm3 ; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE41-NEXT: pandn %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE41-NEXT: pandn %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx ; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx ; CHECK-SSE41-NEXT: shll $16, %edx @@ -2362,45 +3908,43 @@ ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4 ; CHECK-AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; CHECK-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm6 +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 ; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5 ; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 -; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 ; CHECK-AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm4 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 -; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero -; CHECK-AVX1-NEXT: vpackuswb %xmm8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpaddb %xmm7, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5 +; CHECK-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm0[9],zero,zero,zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,zero,zero,xmm0[15],zero +; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-AVX1-NEXT: vpsraw $8, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 +; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-AVX1-NEXT: vpsraw $8, %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 ; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 -; CHECK-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-AVX1-NEXT: vpsraw $8, %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpsrlw $8, %xmm8, %xmm8 -; CHECK-AVX1-NEXT: vpackuswb %xmm7, %xmm8, %xmm7 -; CHECK-AVX1-NEXT: vpsrlw $7, %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5 -; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5 +; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; CHECK-AVX1-NEXT: vpsrlw $7, %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 +; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 ; CHECK-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 ; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6 @@ -2469,32 +4013,46 @@ ; ; CHECK-AVX512VL-LABEL: pr51133: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; CHECK-AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; CHECK-AVX512VL-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; CHECK-AVX512VL-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; CHECK-AVX512VL-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 -; CHECK-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero +; CHECK-AVX512VL-NEXT: vpackuswb %ymm6, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpaddb %ymm4, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpsraw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-AVX512VL-NEXT: vpsraw $8, %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm6, %ymm6 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm4, %ymm6, %ymm4 +; CHECK-AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255] +; CHECK-AVX512VL-NEXT: vpternlogd $128, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm6 +; CHECK-AVX512VL-NEXT: vpaddb %ymm6, %ymm4, %ymm3 +; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 -; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 -; CHECK-AVX512VL-NEXT: vpternlogq $14, %ymm3, %ymm2, %ymm0 +; CHECK-AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; CHECK-AVX512VL-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 +; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpternlogq $20, %ymm1, %ymm3, %ymm0 ; CHECK-AVX512VL-NEXT: retq %rem = srem <32 x i8> %x, %cmp = icmp ne <32 x i8> %rem, zeroinitializer diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -9,55 +9,105 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -71,70 +121,105 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -150,55 +235,102 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_neg25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,4294967295,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: psrad $3, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_neg25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2920577761,2920577761,1374389535] +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $3, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_neg25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_neg25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_neg25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -212,70 +344,106 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_neg100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535] +; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 +; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: psrad $5, %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u> +; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrad $5, %xmm2 +; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_neg100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] +; CHECK-AVX512VL-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrad $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq.ll b/llvm/test/CodeGen/X86/srem-seteq.ll --- a/llvm/test/CodeGen/X86/srem-seteq.ll +++ b/llvm/test/CodeGen/X86/srem-seteq.ll @@ -9,20 +9,32 @@ define i32 @test_srem_odd(i32 %X) nounwind { ; X86-LABEL: test_srem_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 5 %cmp = icmp eq i32 %srem, 0 @@ -33,20 +45,34 @@ define i32 @test_srem_odd_25(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_25: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: addl $85899345, %ecx # imm = 0x51EB851 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $171798691, %ecx # imm = 0xA3D70A3 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_25: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: addl $85899345, %ecx # imm = 0x51EB851 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $35, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $171798691, %ecx # imm = 0xA3D70A3 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 25 %cmp = icmp eq i32 %srem, 0 @@ -58,20 +84,34 @@ define i32 @test_srem_odd_bit30(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB -; X86-NEXT: incl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $536870911, %edx # imm = 0x1FFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $27, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB -; X64-NEXT: incl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: subq %rcx, %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $59, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741827, %eax, %edx # imm = 0x40000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 1073741827 %cmp = icmp eq i32 %srem, 0 @@ -83,20 +123,35 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-715827883, {{[0-9]+}}(%esp), %ecx # imm = 0xD5555555 -; X86-NEXT: incl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-536870913, %edx # imm = 0xDFFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-715827883, %edi, %ecx # imm = 0xD5555555 -; X64-NEXT: incl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: shlq $29, %rax +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483645, %eax, %edx # imm = 0x80000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 2147483651 %cmp = icmp eq i32 %srem, 0 @@ -111,25 +166,35 @@ define i16 @test_srem_even(i16 %X) nounwind { ; X86-LABEL: test_srem_even: ; X86: # %bb.0: -; X86-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7 -; X86-NEXT: addl $4680, %eax # imm = 0x1248 -; X86-NEXT: rorw %ax -; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull $18725, %ecx, %edx # imm = 0x4925 +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $18, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx), %eax +; X86-NEXT: shll $4, %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4681, %ecx # imm = 0x1249 -; X86-NEXT: setae %al +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even: ; X64: # %bb.0: -; X64-NEXT: imull $28087, %edi, %eax # imm = 0x6DB7 -; X64-NEXT: addl $4680, %eax # imm = 0x1248 -; X64-NEXT: rorw %ax -; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movswl %di, %ecx +; X64-NEXT: imull $18725, %ecx, %edx # imm = 0x4925 +; X64-NEXT: movl %edx, %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: sarl $18, %edx +; X64-NEXT: addl %eax, %edx +; X64-NEXT: leal (%rdx,%rdx), %eax +; X64-NEXT: shll $4, %edx +; X64-NEXT: subl %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4681, %ecx # imm = 0x1249 -; X64-NEXT: setae %al +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: setne %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %srem = srem i16 %X, 14 @@ -141,22 +206,32 @@ define i32 @test_srem_even_100(i32 %X) nounwind { ; X86-LABEL: test_srem_even_100: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: addl $85899344, %ecx # imm = 0x51EB850 -; X86-NEXT: rorl $2, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $5, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $100, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_100: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: addl $85899344, %ecx # imm = 0x51EB850 -; X64-NEXT: rorl $2, %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1374389535, %rcx, %rax # imm = 0x51EB851F +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $37, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $100, %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 100 %cmp = icmp eq i32 %srem, 0 @@ -168,22 +243,32 @@ define i32 @test_srem_even_bit30(i32 %X) nounwind { ; X86-LABEL: test_srem_even_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5 -; X86-NEXT: addl $8, %ecx -; X86-NEXT: rorl $3, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1073741721, %edx # imm = 0x3FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $28, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $1073741928, %edx, %edx # imm = 0x40000068 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5 -; X64-NEXT: addl $8, %ecx -; X64-NEXT: rorl $3, %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1073741721, %rcx, %rax # imm = 0x3FFFFF99 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $60, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $1073741928, %eax, %edx # imm = 0x40000068 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 1073741928 %cmp = icmp eq i32 %srem, 0 @@ -195,22 +280,35 @@ define i32 @test_srem_even_bit31(i32 %X) nounwind { ; X86-LABEL: test_srem_even_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-989526779, {{[0-9]+}}(%esp), %ecx # imm = 0xC5050505 -; X86-NEXT: addl $2, %ecx -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2147483545, %edx # imm = 0x7FFFFF99 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $30, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-2147483546, %edx, %edx # imm = 0x80000066 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $3, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_even_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-989526779, %edi, %ecx # imm = 0xC5050505 -; X64-NEXT: addl $2, %ecx -; X64-NEXT: rorl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $2147483545, %rcx, %rax # imm = 0x7FFFFF99 +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $30, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-2147483546, %eax, %edx # imm = 0x80000066 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $3, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %srem = srem i32 %X, 2147483750 %cmp = icmp eq i32 %srem, 0 @@ -226,20 +324,32 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ; X86-LABEL: test_srem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1717986919, %edx # imm = 0x66666667 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1717986919, %rcx, %rax # imm = 0x66666667 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, 5 %cmp = icmp ne i32 %srem, 0 @@ -251,20 +361,32 @@ define i32 @test_srem_negative_odd(i32 %X) nounwind { ; X86-LABEL: test_srem_negative_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD -; X86-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1717986919, %edx # imm = 0x99999999 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X86-NEXT: setae %al +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_negative_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD -; X64-NEXT: addl $429496729, %ecx # imm = 0x19999999 +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $-1717986919, %rcx, %rax # imm = 0x99999999 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: shrq $63, %rdx +; X64-NEXT: sarq $33, %rax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: leal (%rax,%rax,4), %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: setae %al +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, -5 %cmp = icmp ne i32 %srem, 0 @@ -274,22 +396,35 @@ define i32 @test_srem_negative_even(i32 %X) nounwind { ; X86-LABEL: test_srem_negative_even: ; X86: # %bb.0: -; X86-NEXT: imull $-1227133513, {{[0-9]+}}(%esp), %ecx # imm = 0xB6DB6DB7 -; X86-NEXT: addl $306783378, %ecx # imm = 0x12492492 -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1840700269, %edx # imm = 0x6DB6DB6D +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull %edx +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shrl $31, %eax +; X86-NEXT: sarl $3, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: imull $-14, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $306783379, %ecx # imm = 0x12492493 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_negative_even: ; X64: # %bb.0: -; X64-NEXT: imull $-1227133513, %edi, %ecx # imm = 0xB6DB6DB7 -; X64-NEXT: addl $306783378, %ecx # imm = 0x12492492 -; X64-NEXT: rorl %ecx +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: imulq $1840700269, %rcx, %rax # imm = 0x6DB6DB6D +; X64-NEXT: shrq $32, %rax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: movl %eax, %edx +; X64-NEXT: shrl $31, %edx +; X64-NEXT: sarl $3, %eax +; X64-NEXT: addl %edx, %eax +; X64-NEXT: imull $-14, %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $306783379, %ecx # imm = 0x12492493 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %srem = srem i32 %X, -14 %cmp = icmp ne i32 %srem, 0 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -14,7 +14,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx +; SSE-NEXT: shrl $9, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 ; SSE-NEXT: subl %ecx, %eax @@ -26,7 +26,7 @@ ; SSE-NEXT: movzwl %dx, %edx ; SSE-NEXT: movswl %dx, %esi ; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi +; SSE-NEXT: shrl $6, %esi ; SSE-NEXT: addl %edx, %esi ; SSE-NEXT: imull $95, %esi, %edx ; SSE-NEXT: subl %edx, %ecx @@ -36,7 +36,8 @@ ; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF ; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $5, %edx ; SSE-NEXT: addl %esi, %edx ; SSE-NEXT: imull $-124, %edx, %edx ; SSE-NEXT: subl %edx, %ecx @@ -46,7 +47,8 @@ ; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 ; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $2, %edx ; SSE-NEXT: addl %esi, %edx ; SSE-NEXT: imull $98, %edx, %edx ; SSE-NEXT: subl %edx, %ecx @@ -65,7 +67,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $9, %edx +; AVX-NEXT: shrl $9, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 ; AVX-NEXT: subl %ecx, %eax @@ -77,7 +79,7 @@ ; AVX-NEXT: movzwl %dx, %edx ; AVX-NEXT: movswl %dx, %esi ; AVX-NEXT: shrl $15, %edx -; AVX-NEXT: sarl $6, %esi +; AVX-NEXT: shrl $6, %esi ; AVX-NEXT: addl %edx, %esi ; AVX-NEXT: imull $95, %esi, %edx ; AVX-NEXT: subl %edx, %ecx @@ -87,7 +89,8 @@ ; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF ; AVX-NEXT: movl %edx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $21, %edx +; AVX-NEXT: sarl $16, %edx +; AVX-NEXT: shrl $5, %edx ; AVX-NEXT: addl %esi, %edx ; AVX-NEXT: imull $-124, %edx, %edx ; AVX-NEXT: subl %edx, %ecx @@ -97,7 +100,8 @@ ; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73 ; AVX-NEXT: movl %edx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $18, %edx +; AVX-NEXT: sarl $16, %edx +; AVX-NEXT: shrl $2, %edx ; AVX-NEXT: addl %esi, %edx ; AVX-NEXT: imull $98, %edx, %edx ; AVX-NEXT: subl %edx, %ecx @@ -179,13 +183,13 @@ ; SSE-NEXT: leal 31(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx +; SSE-NEXT: andl $65504, %ecx # imm = 0xFFE0 ; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: movd %xmm0, %ecx ; SSE-NEXT: leal 63(%rcx), %edx ; SSE-NEXT: testw %cx, %cx ; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx +; SSE-NEXT: andl $65472, %edx # imm = 0xFFC0 ; SSE-NEXT: subl %edx, %ecx ; SSE-NEXT: movd %ecx, %xmm1 ; SSE-NEXT: pinsrw $1, %eax, %xmm1 @@ -193,7 +197,7 @@ ; SSE-NEXT: leal 7(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx +; SSE-NEXT: andl $65528, %ecx # imm = 0xFFF8 ; SSE-NEXT: subl %ecx, %eax ; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax @@ -204,7 +208,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx +; SSE-NEXT: shrl $6, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: imull $95, %edx, %ecx ; SSE-NEXT: subl %ecx, %eax @@ -218,13 +222,13 @@ ; AVX-NEXT: leal 31(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx +; AVX-NEXT: andl $65504, %ecx # imm = 0xFFE0 ; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vmovd %xmm0, %ecx ; AVX-NEXT: leal 63(%rcx), %edx ; AVX-NEXT: testw %cx, %cx ; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx +; AVX-NEXT: andl $65472, %edx # imm = 0xFFC0 ; AVX-NEXT: subl %edx, %ecx ; AVX-NEXT: vmovd %ecx, %xmm1 ; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 @@ -232,7 +236,7 @@ ; AVX-NEXT: leal 7(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx +; AVX-NEXT: andl $65528, %ecx # imm = 0xFFF8 ; AVX-NEXT: subl %ecx, %eax ; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax @@ -243,7 +247,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx +; AVX-NEXT: shrl $6, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: imull $95, %edx, %ecx ; AVX-NEXT: subl %ecx, %eax @@ -257,32 +261,33 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_srem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx +; SSE-NEXT: pextrw $2, %xmm0, %ecx +; SSE-NEXT: movswl %cx, %eax +; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 +; SSE-NEXT: shrl $16, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: movzwl %ax, %edx +; SSE-NEXT: movswl %dx, %eax +; SSE-NEXT: shrl $15, %edx +; SSE-NEXT: shrl $4, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: leal (%rax,%rax,2), %edx +; SSE-NEXT: shll $3, %edx +; SSE-NEXT: subl %edx, %eax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: pextrw $1, %xmm0, %ecx ; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; SSE-NEXT: movl %ecx, %esi +; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B +; SSE-NEXT: movl %edx, %esi ; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %ecx -; SSE-NEXT: addl %esi, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: sarl $16, %edx +; SSE-NEXT: shrl $7, %edx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E +; SSE-NEXT: subl %edx, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 @@ -306,7 +311,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: shrl $4, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: leal (%rdx,%rdx,2), %ecx ; AVX-NEXT: shll $3, %ecx @@ -317,7 +322,8 @@ ; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B ; AVX-NEXT: movl %ecx, %esi ; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx +; AVX-NEXT: sarl $16, %ecx +; AVX-NEXT: shrl $7, %ecx ; AVX-NEXT: addl %esi, %ecx ; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E ; AVX-NEXT: subl %ecx, %eax @@ -351,7 +357,7 @@ ; SSE-NEXT: movzwl %cx, %ecx ; SSE-NEXT: movswl %cx, %edx ; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx +; SSE-NEXT: shrl $4, %edx ; SSE-NEXT: addl %ecx, %edx ; SSE-NEXT: leal (%rdx,%rdx,2), %ecx ; SSE-NEXT: shll $3, %ecx @@ -361,7 +367,7 @@ ; SSE-NEXT: leal 32767(%rax), %ecx ; SSE-NEXT: testw %ax, %ax ; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 +; SSE-NEXT: andl $32768, %ecx # imm = 0x8000 ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pinsrw $1, %ecx, %xmm1 @@ -389,7 +395,7 @@ ; AVX-NEXT: movzwl %cx, %ecx ; AVX-NEXT: movswl %cx, %edx ; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx +; AVX-NEXT: shrl $4, %edx ; AVX-NEXT: addl %ecx, %edx ; AVX-NEXT: leal (%rdx,%rdx,2), %ecx ; AVX-NEXT: shll $3, %ecx @@ -399,7 +405,7 @@ ; AVX-NEXT: leal 32767(%rax), %ecx ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 +; AVX-NEXT: andl $32768, %ecx # imm = 0x8000 ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/sse-intel-ocl.ll b/llvm/test/CodeGen/X86/sse-intel-ocl.ll --- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll +++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll @@ -220,9 +220,9 @@ ; WIN64-NEXT: subq $232, %rsp ; WIN64-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; WIN64-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; WIN64-NEXT: movaps (%r9), %xmm4 -; WIN64-NEXT: movaps (%rdx), %xmm5 -; WIN64-NEXT: movaps (%r8), %xmm6 +; WIN64-NEXT: movaps (%rdx), %xmm4 +; WIN64-NEXT: movaps (%r8), %xmm5 +; WIN64-NEXT: movaps (%r9), %xmm6 ; WIN64-NEXT: movaps (%rcx), %xmm7 ; WIN64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) @@ -231,11 +231,11 @@ ; WIN64-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/sse2-vector-shifts.ll b/llvm/test/CodeGen/X86/sse2-vector-shifts.ll --- a/llvm/test/CodeGen/X86/sse2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/sse2-vector-shifts.ll @@ -334,8 +334,9 @@ define <4 x i16> @sra_trunc_srl_v4i32(<4 x i32> %x) nounwind { ; CHECK-LABEL: sra_trunc_srl_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: psrad $19, %xmm0 +; CHECK-NEXT: psrad $16, %xmm0 ; CHECK-NEXT: packssdw %xmm0, %xmm0 +; CHECK-NEXT: psraw $3, %xmm0 ; CHECK-NEXT: retq %srl = lshr <4 x i32> %x, %trunc = trunc <4 x i32> %srl to <4 x i16> diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -103,6 +103,8 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovaps (%edx), %xmm0 ; X86-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -117,6 +119,8 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rsi), %xmm0 ; X64-AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp = load <4 x float>, ptr %B ; <<4 x float>> [#uses=2] @@ -597,25 +601,25 @@ define fastcc void @test17() nounwind { ; X86-SSE-LABEL: test17: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: test17: ; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test17: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = ; X64-SSE-NEXT: movaps %xmm0, (%rax) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: test17: ; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X64-AVX-NEXT: vmovaps %xmm0, (%rax) ; X64-AVX-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -764,34 +764,52 @@ ; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02] +; X86-SSE-NEXT: movd (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6e,0x08] +; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshufd $68, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x44] +; X86-SSE-NEXT: ## xmm1 = xmm1[0,1,0,1] +; X86-SSE-NEXT: pblendw $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] +; X86-AVX1-NEXT: vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08] +; X86-AVX1-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] +; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08] +; X86-AVX512-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02] +; X64-SSE-NEXT: movd (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x6e,0x0f] +; X64-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshufd $68, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x44] +; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,0,1] +; X64-SSE-NEXT: pblendw $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] +; X64-AVX1-NEXT: vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f] +; X64-AVX1-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] +; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f] +; X64-AVX512-NEXT: vblendps $4, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x04] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load i32, ptr %b, align 4 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 @@ -1160,29 +1178,27 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_W00W: ; SSE: ## %bb.0: -; SSE-NEXT: pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff] -; SSE-NEXT: ## xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] -; SSE-NEXT: pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3] -; SSE-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; SSE-NEXT: movdqa %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x6f,0xc8] +; SSE-NEXT: psrldq $12, %xmm1 ## encoding: [0x66,0x0f,0x73,0xd9,0x0c] +; SSE-NEXT: ## xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] +; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_W00W: ; AVX1: ## %bb.0: -; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] -; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: vpsrldq $12, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x73,0xd8,0x0c] +; AVX1-NEXT: ## xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpblendw $192, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0e,0xc0,0xc0] +; AVX1-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_W00W: ; AVX512: ## %bb.0: -; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] -; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: vpsrldq $12, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xd8,0x0c] +; AVX512-NEXT: ## xmm1 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; AVX512-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 3 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -1595,8 +1611,9 @@ ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X86-AVX1-NEXT: vmovups (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x08] +; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -1615,8 +1632,9 @@ ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] +; X64-AVX1-NEXT: vmovups (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x10,0x0f] +; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: @@ -2124,14 +2142,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2152,14 +2170,14 @@ ; AVX1-LABEL: build_vector_to_shuffle_2: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -128,7 +128,7 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -18,31 +18,29 @@ ; X64-NEXT: movdqa %xmm2, %xmm5 ; X64-NEXT: psrlq %xmm4, %xmm5 ; X64-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; X64-NEXT: movdqa %xmm0, %xmm6 -; X64-NEXT: psllq %xmm1, %xmm6 ; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psllq %xmm4, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] -; X64-NEXT: psrlq %xmm1, %xmm6 +; X64-NEXT: psllq %xmm1, %xmm3 +; X64-NEXT: movdqa %xmm0, %xmm6 +; X64-NEXT: psllq %xmm4, %xmm6 +; X64-NEXT: movdqa %xmm6, %xmm7 +; X64-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; X64-NEXT: psrlq %xmm1, %xmm3 ; X64-NEXT: psrlq %xmm4, %xmm7 -; X64-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] +; X64-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; X64-NEXT: xorpd %xmm5, %xmm7 ; X64-NEXT: psubq %xmm5, %xmm7 ; X64-NEXT: pcmpeqd %xmm0, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,0,3,2] ; X64-NEXT: pand %xmm7, %xmm1 -; X64-NEXT: andpd %xmm1, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: pxor %xmm5, %xmm5 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-NEXT: pxor %xmm5, %xmm4 -; X64-NEXT: pandn %xmm4, %xmm2 -; X64-NEXT: por %xmm0, %xmm2 -; X64-NEXT: pandn %xmm2, %xmm1 -; X64-NEXT: por %xmm3, %xmm1 +; X64-NEXT: andpd %xmm1, %xmm6 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: pxor %xmm3, %xmm3 +; X64-NEXT: pcmpgtd %xmm0, %xmm3 +; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-NEXT: por %xmm2, %xmm3 +; X64-NEXT: pandn %xmm3, %xmm1 +; X64-NEXT: por %xmm6, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1270,18 +1270,19 @@ ; SSE41-NEXT: pand %xmm5, %xmm4 ; SSE41-NEXT: por %xmm3, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: pxor %xmm4, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; @@ -1475,55 +1476,57 @@ ; SSE41-LABEL: v4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm6 +; SSE41-NEXT: por %xmm0, %xmm6 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm8 -; SSE41-NEXT: por %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pand %xmm7, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm5 -; SSE41-NEXT: por %xmm2, %xmm5 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movapd {{.*#+}} xmm8 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm7, %xmm2 +; SSE41-NEXT: movapd {{.*#+}} xmm7 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm6, %xmm8 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: psubq %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 +; SSE41-NEXT: pxor %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm2, %xmm8 ; SSE41-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm9 ; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm6, %xmm3 +; SSE41-NEXT: pxor %xmm5, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm2 +; SSE41-NEXT: por %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm9, %xmm2 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: retq ; @@ -1844,103 +1847,107 @@ ; SSE41-LABEL: v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm4, %xmm8 -; SSE41-NEXT: movdqa %xmm8, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm10 ; SSE41-NEXT: movdqa %xmm0, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE41-NEXT: pcmpeqd %xmm10, %xmm11 +; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm11, %xmm10 +; SSE41-NEXT: por %xmm0, %xmm10 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm12 -; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm11, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm9 -; SSE41-NEXT: por %xmm4, %xmm9 -; SSE41-NEXT: pxor %xmm12, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm12 = [9223372036854775807,9223372036854775807] -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775808,9223372036854775808] -; SSE41-NEXT: movapd %xmm11, %xmm4 +; SSE41-NEXT: movapd {{.*#+}} xmm11 = [9223372036854775807,9223372036854775807] +; SSE41-NEXT: movapd {{.*#+}} xmm10 = [9223372036854775808,9223372036854775808] +; SSE41-NEXT: movapd %xmm10, %xmm12 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm8 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm9 +; SSE41-NEXT: pxor %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm12 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm12 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm9, %xmm13 +; SSE41-NEXT: pand %xmm12, %xmm13 ; SSE41-NEXT: por %xmm0, %xmm13 -; SSE41-NEXT: pxor %xmm10, %xmm5 +; SSE41-NEXT: pxor %xmm9, %xmm5 ; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm12, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm13, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm6, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm9 -; SSE41-NEXT: por %xmm0, %xmm9 -; SSE41-NEXT: pxor %xmm10, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm12 +; SSE41-NEXT: por %xmm0, %xmm12 +; SSE41-NEXT: pxor %xmm9, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm11, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm12, %xmm4 +; SSE41-NEXT: movapd %xmm10, %xmm5 ; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm5 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm5 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm0 ; SSE41-NEXT: psubq %xmm7, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm10, %xmm4 +; SSE41-NEXT: pxor %xmm9, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm6 ; SSE41-NEXT: por %xmm0, %xmm6 -; SSE41-NEXT: pxor %xmm10, %xmm7 +; SSE41-NEXT: pxor %xmm9, %xmm7 ; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm0, %xmm4 -; SSE41-NEXT: por %xmm7, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: por %xmm0, %xmm4 ; SSE41-NEXT: pxor %xmm6, %xmm4 ; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm11 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm10 ; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 ; SSE41-NEXT: movapd %xmm8, %xmm0 ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll --- a/llvm/test/CodeGen/X86/store-narrow.ll +++ b/llvm/test/CodeGen/X86/store-narrow.ll @@ -67,22 +67,21 @@ define void @test3(ptr nocapture %a0, i16 zeroext %a1) nounwind ssp { ; X64-LABEL: test3: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movw %si, (%rdi) +; X64-NEXT: movzwl 2(%rdi), %eax +; X64-NEXT: shll $16, %eax +; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq ; -; X86-BWON-LABEL: test3: -; X86-BWON: ## %bb.0: ## %entry -; X86-BWON-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWON-NEXT: movw %ax, (%ecx) -; X86-BWON-NEXT: retl -; -; X86-BWOFF-LABEL: test3: -; X86-BWOFF: ## %bb.0: ## %entry -; X86-BWOFF-NEXT: movw {{[0-9]+}}(%esp), %ax -; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWOFF-NEXT: movw %ax, (%ecx) -; X86-BWOFF-NEXT: retl +; X86-LABEL: test3: +; X86: ## %bb.0: ## %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl 2(%ecx), %edx +; X86-NEXT: shll $16, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: retl entry: %A = load i32, ptr %a0, align 4 %B = and i32 %A, -65536 ; 0xFFFF0000 @@ -95,22 +94,21 @@ define void @test4(ptr nocapture %a0, i16 zeroext %a1) nounwind ssp { ; X64-LABEL: test4: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movw %si, 2(%rdi) +; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: shll $16, %esi +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, (%rdi) ; X64-NEXT: retq ; -; X86-BWON-LABEL: test4: -; X86-BWON: ## %bb.0: ## %entry -; X86-BWON-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWON-NEXT: movw %ax, 2(%ecx) -; X86-BWON-NEXT: retl -; -; X86-BWOFF-LABEL: test4: -; X86-BWOFF: ## %bb.0: ## %entry -; X86-BWOFF-NEXT: movw {{[0-9]+}}(%esp), %ax -; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BWOFF-NEXT: movw %ax, 2(%ecx) -; X86-BWOFF-NEXT: retl +; X86-LABEL: test4: +; X86: ## %bb.0: ## %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl entry: %A = load i32, ptr %a0, align 4 %B = and i32 %A, 65535 ; 0x0000FFFF diff --git a/llvm/test/CodeGen/X86/stores-merging.ll b/llvm/test/CodeGen/X86/stores-merging.ll --- a/llvm/test/CodeGen/X86/stores-merging.ll +++ b/llvm/test/CodeGen/X86/stores-merging.ll @@ -13,8 +13,9 @@ define dso_local void @redundant_stores_merging() { ; CHECK-LABEL: redundant_stores_merging: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001 +; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001 ; CHECK-NEXT: movq %rax, e+4(%rip) +; CHECK-NEXT: movl $456, e+8(%rip) # imm = 0x1C8 ; CHECK-NEXT: retq store i32 1, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 1), align 4 store i32 123, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 @@ -26,9 +27,8 @@ define dso_local void @redundant_stores_merging_reverse() { ; CHECK-LABEL: redundant_stores_merging_reverse: ; CHECK: # %bb.0: -; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001 +; CHECK-NEXT: movabsq $1958505086977, %rax # imm = 0x1C800000001 ; CHECK-NEXT: movq %rax, e+4(%rip) -; CHECK-NEXT: movl $456, e+8(%rip) # imm = 0x1C8 ; CHECK-NEXT: retq store i32 123, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 store i32 456, ptr getelementptr inbounds (%structTy, ptr @e, i64 0, i32 2), align 4 @@ -359,14 +359,12 @@ define dso_local void @rotate32_consecutive(ptr %p) { ; CHECK-LABEL: rotate32_consecutive: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movzwl 4(%rdi), %edx -; CHECK-NEXT: movzwl 6(%rdi), %esi -; CHECK-NEXT: movw %cx, 84(%rdi) -; CHECK-NEXT: movw %ax, 86(%rdi) -; CHECK-NEXT: movw %si, 88(%rdi) -; CHECK-NEXT: movw %dx, 90(%rdi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: movl 4(%rdi), %ecx +; CHECK-NEXT: roll $16, %eax +; CHECK-NEXT: roll $16, %ecx +; CHECK-NEXT: movl %eax, 84(%rdi) +; CHECK-NEXT: movl %ecx, 88(%rdi) ; CHECK-NEXT: retq %p1 = getelementptr i16, ptr %p, i64 1 %p2 = getelementptr i16, ptr %p, i64 2 @@ -433,7 +431,12 @@ define dso_local void @trunc_i32_to_i8(i32 %x, ptr %p) { ; CHECK-LABEL: trunc_i32_to_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, (%rsi) +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movw %di, (%rsi) +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: shrl $24, %eax +; CHECK-NEXT: movb %dil, 2(%rsi) +; CHECK-NEXT: movb %al, 3(%rsi) ; CHECK-NEXT: retq %t1 = trunc i32 %x to i8 %sh1 = lshr i32 %x, 8 @@ -499,7 +502,24 @@ define dso_local void @trunc_i64_to_i8(i64 %x, ptr %p) { ; CHECK-LABEL: trunc_i64_to_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, (%rsi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: movq %rdi, %rdx +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: movw %di, (%rsi) +; CHECK-NEXT: shrq $16, %rdi +; CHECK-NEXT: shrq $24, %rax +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: shrq $40, %rdx +; CHECK-NEXT: shrq $48, %r8 +; CHECK-NEXT: shrq $56, %r9 +; CHECK-NEXT: movb %dil, 2(%rsi) +; CHECK-NEXT: movb %al, 3(%rsi) +; CHECK-NEXT: movb %cl, 4(%rsi) +; CHECK-NEXT: movb %dl, 5(%rsi) +; CHECK-NEXT: movb %r8b, 6(%rsi) +; CHECK-NEXT: movb %r9b, 7(%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i8 %sh1 = lshr i64 %x, 8 @@ -537,7 +557,12 @@ define dso_local void @trunc_i64_to_i16(i64 %x, ptr %p) { ; CHECK-LABEL: trunc_i64_to_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, (%rsi) +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movl %edi, (%rsi) +; CHECK-NEXT: shrq $32, %rdi +; CHECK-NEXT: shrq $48, %rax +; CHECK-NEXT: movw %di, 4(%rsi) +; CHECK-NEXT: movw %ax, 6(%rsi) ; CHECK-NEXT: retq %t1 = trunc i64 %x to i16 %sh1 = lshr i64 %x, 16 diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -43,16 +43,21 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: subq (%rsi), %rdx -; CHECK-NEXT: movl $0, %edi -; CHECK-NEXT: sbbq 8(%rsi), %rdi -; CHECK-NEXT: movl $0, %r8d -; CHECK-NEXT: sbbq 16(%rsi), %r8 -; CHECK-NEXT: sbbq 24(%rsi), %rcx -; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: subq (%rsi), %rcx +; CHECK-NEXT: setae %dl +; CHECK-NEXT: movq 8(%rsi), %rdi +; CHECK-NEXT: movq 16(%rsi), %r8 +; CHECK-NEXT: notq %rdi +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: notq %r8 +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: movq 24(%rsi), %rdx +; CHECK-NEXT: notq %rdx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq entry: %0 = load i64, ptr %this, align 8 @@ -94,13 +99,12 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rdi -; CHECK-NEXT: movq 8(%rsi), %r10 +; CHECK-NEXT: xorl %r10d, %r10d ; CHECK-NEXT: subq %rdx, %rdi -; CHECK-NEXT: setae %dl -; CHECK-NEXT: addb $-1, %dl -; CHECK-NEXT: adcq $0, %r10 +; CHECK-NEXT: setae %r10b +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addq 8(%rsi), %r10 ; CHECK-NEXT: setb %dl -; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: notq %rcx ; CHECK-NEXT: addq %r10, %rcx ; CHECK-NEXT: adcq 16(%rsi), %rdx @@ -345,10 +349,14 @@ define { i64, i1 } @subcarry_fake_carry(i64 %a, i64 %b, i1 %carryin) { ; CHECK-LABEL: subcarry_fake_carry: ; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: btl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rax +; CHECK-NEXT: subq %rsi, %rax +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: setb %dl +; CHECK-NEXT: orb %cl, %dl ; CHECK-NEXT: retq %t1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %partial = extractvalue { i64, i1 } %t1, 0 @@ -596,14 +604,12 @@ ; CHECK-NEXT: movq 8(%rsi), %rdi ; CHECK-NEXT: movq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rsi), %rsi -; CHECK-NEXT: xorl %r9d, %r9d ; CHECK-NEXT: subq 16(%rdx), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: subq 24(%rdx), %rsi +; CHECK-NEXT: sbbq 24(%rdx), %rsi ; CHECK-NEXT: subq (%rdx), %rcx ; CHECK-NEXT: sbbq 8(%rdx), %rdi ; CHECK-NEXT: sbbq $0, %r8 -; CHECK-NEXT: sbbq %r9, %rsi +; CHECK-NEXT: sbbq $0, %rsi ; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -1658,18 +1658,31 @@ } define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) { -; X86-LABEL: broadcast_v4f64_v2f64_4u61: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1 -; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X86-NEXT: retl +; X86-AVX-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X86-AVX-NEXT: retl ; -; X64-LABEL: broadcast_v4f64_v2f64_4u61: -; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; X64-NEXT: retq +; X86-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X86-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X86-AVX512-NEXT: retl +; +; X64-AVX-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X64-AVX-NEXT: retq +; +; X64-AVX512-LABEL: broadcast_v4f64_v2f64_4u61: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; X64-AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; X64-AVX512-NEXT: retq %vec = load <2 x double>, ptr %vp %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> %res = select <4 x i1> , <4 x double> %shuf, <4 x double> %default @@ -1681,13 +1694,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vbroadcastsd (%eax), %ymm1 -; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; X86-NEXT: retl ; ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu: ; X64: # %bb.0: ; X64-NEXT: vbroadcastsd (%rdi), %ymm1 -; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; X64-NEXT: retq %vec = load <2 x float>, ptr %vp %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> @@ -1735,7 +1748,8 @@ ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X86-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512-NEXT: retl ; ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101: @@ -1746,7 +1760,8 @@ ; ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X64-AVX512-NEXT: retq %vec = load <2 x double>, ptr %vp %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -356,44 +356,14 @@ ; SCALAR-NEXT: movl %eax, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -419,44 +389,14 @@ ; SCALAR-NEXT: movl %eax, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -566,44 +506,14 @@ ; SCALAR-NEXT: movw %r8w, 8(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -661,44 +571,14 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec128_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec128_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec128_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec128_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: retq +; SSE-LABEL: vec128_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -966,49 +846,16 @@ ; SCALAR-NEXT: movl %eax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -1042,49 +889,16 @@ ; SCALAR-NEXT: movl %eax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -1318,49 +1132,16 @@ ; SCALAR-NEXT: movw %ax, 24(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -1513,49 +1294,16 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec256_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec256_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec256_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec256_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec256_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -2130,53 +1878,18 @@ ; SCALAR-NEXT: movl %eax, 40(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -2218,53 +1931,18 @@ ; SCALAR-NEXT: movl %eax, 40(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -3337,21 +3015,21 @@ define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v3i64: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: notq %rdi +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi ; SCALAR-NEXT: notq %rcx +; SCALAR-NEXT: notq %rdi ; SCALAR-NEXT: notq %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v3i64: @@ -3409,21 +3087,21 @@ define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v3f64: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: notq %rdi +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi ; SCALAR-NEXT: notq %rcx +; SCALAR-NEXT: notq %rdi ; SCALAR-NEXT: notq %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v3f64: @@ -3647,53 +3325,18 @@ ; SCALAR-NEXT: movw %ax, 40(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -3800,56 +3443,56 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6i8: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %rax -; SCALAR-NEXT: shrq $40, %rax -; SCALAR-NEXT: movq %rdi, %rcx -; SCALAR-NEXT: shrq $32, %rcx -; SCALAR-NEXT: movl %edi, %r8d -; SCALAR-NEXT: shrl $24, %r8d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $16, %r9d -; SCALAR-NEXT: movl %edi, %r10d +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movl %ecx, %eax +; SCALAR-NEXT: shrl $16, %eax +; SCALAR-NEXT: movl %ecx, %edi +; SCALAR-NEXT: shrl $24, %edi +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $40, %r8 +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $32, %r9 +; SCALAR-NEXT: movl %ecx, %r10d ; SCALAR-NEXT: shrl $8, %r10d -; SCALAR-NEXT: notb %dil -; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d ; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: orl %ecx, %r10d ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %edi +; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %edi, %r8d -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movzbl %cl, %ecx +; SCALAR-NEXT: movzbl %r8b, %ecx +; SCALAR-NEXT: shll $8, %ecx +; SCALAR-NEXT: orl %r9d, %ecx +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movzbl %al, %eax -; SCALAR-NEXT: shll $8, %eax -; SCALAR-NEXT: orl %ecx, %eax -; SCALAR-NEXT: movw %ax, 4(%rsi) -; SCALAR-NEXT: shll $16, %r8d -; SCALAR-NEXT: movzwl %r10w, %ecx -; SCALAR-NEXT: orl %r8d, %ecx -; SCALAR-NEXT: movl %ecx, (%rsi) -; SCALAR-NEXT: movw %ax, 4(%rdx) -; SCALAR-NEXT: movl %ecx, (%rdx) -; SCALAR-NEXT: movw %ax, 12(%rdx) -; SCALAR-NEXT: movl %ecx, 8(%rdx) -; SCALAR-NEXT: movw %ax, 20(%rdx) -; SCALAR-NEXT: movl %ecx, 16(%rdx) -; SCALAR-NEXT: movw %ax, 28(%rdx) -; SCALAR-NEXT: movl %ecx, 24(%rdx) -; SCALAR-NEXT: movw %ax, 36(%rdx) -; SCALAR-NEXT: movl %ecx, 32(%rdx) -; SCALAR-NEXT: movw %ax, 44(%rdx) -; SCALAR-NEXT: movl %ecx, 40(%rdx) -; SCALAR-NEXT: movw %ax, 52(%rdx) -; SCALAR-NEXT: movl %ecx, 48(%rdx) -; SCALAR-NEXT: movw %ax, 60(%rdx) -; SCALAR-NEXT: movl %ecx, 56(%rdx) +; SCALAR-NEXT: movw %cx, 4(%rsi) +; SCALAR-NEXT: shll $16, %eax +; SCALAR-NEXT: shll $24, %edi +; SCALAR-NEXT: orl %eax, %edi +; SCALAR-NEXT: movzwl %r10w, %eax +; SCALAR-NEXT: orl %edi, %eax +; SCALAR-NEXT: movl %eax, (%rsi) +; SCALAR-NEXT: movw %cx, 4(%rdx) +; SCALAR-NEXT: movl %eax, (%rdx) +; SCALAR-NEXT: movw %cx, 12(%rdx) +; SCALAR-NEXT: movl %eax, 8(%rdx) +; SCALAR-NEXT: movw %cx, 20(%rdx) +; SCALAR-NEXT: movl %eax, 16(%rdx) +; SCALAR-NEXT: movw %cx, 28(%rdx) +; SCALAR-NEXT: movl %eax, 24(%rdx) +; SCALAR-NEXT: movw %cx, 36(%rdx) +; SCALAR-NEXT: movl %eax, 32(%rdx) +; SCALAR-NEXT: movw %cx, 44(%rdx) +; SCALAR-NEXT: movl %eax, 40(%rdx) +; SCALAR-NEXT: movw %cx, 52(%rdx) +; SCALAR-NEXT: movl %eax, 48(%rdx) +; SCALAR-NEXT: movw %cx, 60(%rdx) +; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-ONLY-LABEL: vec384_v6i8: @@ -4220,36 +3863,36 @@ define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6i32: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 ; SCALAR-NEXT: shrq $32, %r9 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $32, %r10 ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: orq %r10, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %r10, %rcx ; SCALAR-NEXT: notl %r9d ; SCALAR-NEXT: shlq $32, %r9 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r9, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r9, %rdi ; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v6i32: @@ -4307,36 +3950,36 @@ define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v6f32: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 -; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r9 ; SCALAR-NEXT: shrq $32, %r9 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %rcx, %r10 ; SCALAR-NEXT: shrq $32, %r10 ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: orq %r10, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %r10, %rcx ; SCALAR-NEXT: notl %r9d ; SCALAR-NEXT: shlq $32, %r9 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r9, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r9, %rdi ; SCALAR-NEXT: notl %r8d ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v6f32: @@ -4471,53 +4114,18 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec384_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec384_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec384_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec384_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec384_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -4654,11 +4262,11 @@ ; SCALAR-NEXT: shrl $16, %r12d ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movzbl %r12b, %r12d +; SCALAR-NEXT: shll $16, %r12d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movzbl %r15b, %r15d -; SCALAR-NEXT: shll $8, %r15d +; SCALAR-NEXT: shll $24, %r15d ; SCALAR-NEXT: orl %r12d, %r15d -; SCALAR-NEXT: shll $16, %r15d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: notb %bpl @@ -4669,11 +4277,11 @@ ; SCALAR-NEXT: orl %r15d, %r9d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movzbl %r14b, %ebp +; SCALAR-NEXT: shll $16, %ebp ; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: movzbl %bl, %ebx -; SCALAR-NEXT: shll $8, %ebx +; SCALAR-NEXT: shll $24, %ebx ; SCALAR-NEXT: orl %ebp, %ebx -; SCALAR-NEXT: shll $16, %ebx ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movzbl %r11b, %r11d ; SCALAR-NEXT: notb %r10b @@ -4684,11 +4292,11 @@ ; SCALAR-NEXT: orl %ebx, %r10d ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $16, %r8d ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl %dil, %edi -; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: shll $24, %edi ; SCALAR-NEXT: orl %r8d, %edi -; SCALAR-NEXT: shll $16, %edi ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx ; SCALAR-NEXT: notb %al @@ -4833,20 +4441,20 @@ ; SCALAR: # %bb.0: ; SCALAR-NEXT: pushq %r14 ; SCALAR-NEXT: pushq %rbx -; SCALAR-NEXT: movq (%rdi), %rax -; SCALAR-NEXT: movq 8(%rdi), %rcx +; SCALAR-NEXT: movq 16(%rdi), %rax ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: movq %rax, %r9 ; SCALAR-NEXT: shrq $48, %r9 -; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: movq (%rdi), %rcx +; SCALAR-NEXT: movq 8(%rdi), %rdi +; SCALAR-NEXT: movq %rdi, %r10 ; SCALAR-NEXT: shrq $32, %r10 -; SCALAR-NEXT: movq %rcx, %r11 +; SCALAR-NEXT: movq %rdi, %r11 ; SCALAR-NEXT: shrq $48, %r11 -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %rbx +; SCALAR-NEXT: movq %rcx, %rbx ; SCALAR-NEXT: shrq $32, %rbx -; SCALAR-NEXT: movq %rdi, %r14 +; SCALAR-NEXT: movq %rcx, %r14 ; SCALAR-NEXT: shrq $48, %r14 ; SCALAR-NEXT: notl %r14d ; SCALAR-NEXT: shll $16, %r14d @@ -4854,16 +4462,16 @@ ; SCALAR-NEXT: movzwl %bx, %ebx ; SCALAR-NEXT: orl %r14d, %ebx ; SCALAR-NEXT: shlq $32, %rbx -; SCALAR-NEXT: notl %edi -; SCALAR-NEXT: orq %rbx, %rdi +; SCALAR-NEXT: notl %ecx +; SCALAR-NEXT: orq %rbx, %rcx ; SCALAR-NEXT: notl %r11d ; SCALAR-NEXT: shll $16, %r11d ; SCALAR-NEXT: notl %r10d ; SCALAR-NEXT: movzwl %r10w, %r10d ; SCALAR-NEXT: orl %r11d, %r10d ; SCALAR-NEXT: shlq $32, %r10 -; SCALAR-NEXT: notl %ecx -; SCALAR-NEXT: orq %r10, %rcx +; SCALAR-NEXT: notl %edi +; SCALAR-NEXT: orq %r10, %rdi ; SCALAR-NEXT: notl %r9d ; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: notl %r8d @@ -4872,15 +4480,15 @@ ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: notl %eax ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) -; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) -; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) -; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rax, 16(%rsi) +; SCALAR-NEXT: movq %rdi, 8(%rsi) +; SCALAR-NEXT: movq %rcx, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) +; SCALAR-NEXT: movq %rdi, 8(%rdx) +; SCALAR-NEXT: movq %rcx, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) +; SCALAR-NEXT: movq %rdi, 40(%rdx) +; SCALAR-NEXT: movq %rcx, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r14 ; SCALAR-NEXT: retq @@ -5128,101 +4736,101 @@ define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; SCALAR-LABEL: vec384_v24i8: ; SCALAR: # %bb.0: -; SCALAR-NEXT: movq (%rdi), %rax +; SCALAR-NEXT: movq 16(%rdi), %rax +; SCALAR-NEXT: movq (%rdi), %r8 ; SCALAR-NEXT: movq 8(%rdi), %rcx -; SCALAR-NEXT: movq 16(%rdi), %rdi -; SCALAR-NEXT: movq %rdi, %r8 -; SCALAR-NEXT: shrq $40, %r8 -; SCALAR-NEXT: movq %rdi, %r9 +; SCALAR-NEXT: movq %r8, %rdi +; SCALAR-NEXT: shrq $40, %rdi +; SCALAR-NEXT: movq %r8, %r9 ; SCALAR-NEXT: shrq $56, %r9 -; SCALAR-NEXT: movq %rdi, %r10 +; SCALAR-NEXT: movq %r8, %r10 ; SCALAR-NEXT: shrq $48, %r10 ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $16, %r10d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r11d +; SCALAR-NEXT: shll $24, %r11d +; SCALAR-NEXT: orl %r10d, %r11d +; SCALAR-NEXT: movq %r8, %r9 +; SCALAR-NEXT: shrq $32, %r9 ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r10d, %r9d -; SCALAR-NEXT: movq %rdi, %r10 -; SCALAR-NEXT: shrq $32, %r10 +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r9d, %edi +; SCALAR-NEXT: movl %r8d, %r10d +; SCALAR-NEXT: shrl $24, %r10d +; SCALAR-NEXT: movzwl %di, %r9d +; SCALAR-NEXT: orl %r11d, %r9d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $16, %edi +; SCALAR-NEXT: notb %dil +; SCALAR-NEXT: movzbl %dil, %edi +; SCALAR-NEXT: shll $16, %edi ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $24, %r10d +; SCALAR-NEXT: orl %edi, %r10d +; SCALAR-NEXT: movl %r8d, %edi +; SCALAR-NEXT: shrl $8, %edi ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: shll $8, %r8d -; SCALAR-NEXT: orl %r10d, %r8d -; SCALAR-NEXT: movl %edi, %r10d -; SCALAR-NEXT: shrl $24, %r10d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r8w, %r8d -; SCALAR-NEXT: orl %r9d, %r8d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $16, %r9d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movzbl %r10b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r9d, %r10d -; SCALAR-NEXT: movl %edi, %r9d -; SCALAR-NEXT: shrl $8, %r9d ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movzbl %dil, %edi -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %edi, %r9d -; SCALAR-NEXT: movq %rcx, %r11 -; SCALAR-NEXT: shrq $40, %r11 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %edi +; SCALAR-NEXT: shll $8, %edi +; SCALAR-NEXT: orl %r8d, %edi +; SCALAR-NEXT: movq %rcx, %r8 +; SCALAR-NEXT: shrq $40, %r8 +; SCALAR-NEXT: movzwl %di, %edi ; SCALAR-NEXT: orl %r10d, %edi +; SCALAR-NEXT: movq %rcx, %r10 +; SCALAR-NEXT: shrq $56, %r10 +; SCALAR-NEXT: shlq $32, %r9 +; SCALAR-NEXT: orq %r9, %rdi ; SCALAR-NEXT: movq %rcx, %r9 -; SCALAR-NEXT: shrq $56, %r9 -; SCALAR-NEXT: shlq $32, %r8 -; SCALAR-NEXT: orq %r8, %rdi -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $48, %r8 -; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shrq $48, %r9 +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $16, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $24, %r10d +; SCALAR-NEXT: orl %r9d, %r10d +; SCALAR-NEXT: movq %rcx, %r9 +; SCALAR-NEXT: shrq $32, %r9 ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %r8d, %r9d -; SCALAR-NEXT: movq %rcx, %r8 -; SCALAR-NEXT: shrq $32, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r8d, %r10d -; SCALAR-NEXT: movl %ecx, %r11d -; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d -; SCALAR-NEXT: movzwl %r10w, %r8d +; SCALAR-NEXT: shll $8, %r8d ; SCALAR-NEXT: orl %r9d, %r8d ; SCALAR-NEXT: movl %ecx, %r9d -; SCALAR-NEXT: shrl $16, %r9d +; SCALAR-NEXT: shrl $24, %r9d +; SCALAR-NEXT: movzwl %r8w, %r8d +; SCALAR-NEXT: orl %r10d, %r8d +; SCALAR-NEXT: movl %ecx, %r10d +; SCALAR-NEXT: shrl $16, %r10d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $16, %r10d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d -; SCALAR-NEXT: orl %r9d, %r10d -; SCALAR-NEXT: movl %ecx, %r9d -; SCALAR-NEXT: shrl $8, %r9d +; SCALAR-NEXT: shll $24, %r9d +; SCALAR-NEXT: orl %r10d, %r9d +; SCALAR-NEXT: movl %ecx, %r10d +; SCALAR-NEXT: shrl $8, %r10d ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movzbl %cl, %ecx -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d -; SCALAR-NEXT: orl %ecx, %r9d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movzbl %r10b, %r10d +; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: orl %ecx, %r10d ; SCALAR-NEXT: movq %rax, %r11 ; SCALAR-NEXT: shrq $40, %r11 -; SCALAR-NEXT: shll $16, %r10d -; SCALAR-NEXT: movzwl %r9w, %ecx -; SCALAR-NEXT: orl %r10d, %ecx +; SCALAR-NEXT: movzwl %r10w, %ecx +; SCALAR-NEXT: orl %r9d, %ecx ; SCALAR-NEXT: movq %rax, %r9 ; SCALAR-NEXT: shrq $56, %r9 ; SCALAR-NEXT: shlq $32, %r8 @@ -5231,9 +4839,10 @@ ; SCALAR-NEXT: shrq $48, %r8 ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movzbl %r8b, %r8d +; SCALAR-NEXT: shll $16, %r8d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d -; SCALAR-NEXT: shll $8, %r9d +; SCALAR-NEXT: shll $24, %r9d ; SCALAR-NEXT: orl %r8d, %r9d ; SCALAR-NEXT: movq %rax, %r8 ; SCALAR-NEXT: shrq $32, %r8 @@ -5245,16 +4854,16 @@ ; SCALAR-NEXT: orl %r8d, %r10d ; SCALAR-NEXT: movl %eax, %r11d ; SCALAR-NEXT: shrl $24, %r11d -; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: movzwl %r10w, %r8d ; SCALAR-NEXT: orl %r9d, %r8d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $16, %r9d ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movzbl %r9b, %r9d +; SCALAR-NEXT: shll $16, %r9d ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movzbl %r11b, %r10d -; SCALAR-NEXT: shll $8, %r10d +; SCALAR-NEXT: shll $24, %r10d ; SCALAR-NEXT: orl %r9d, %r10d ; SCALAR-NEXT: movl %eax, %r9d ; SCALAR-NEXT: shrl $8, %r9d @@ -5264,20 +4873,19 @@ ; SCALAR-NEXT: movzbl %r9b, %r9d ; SCALAR-NEXT: shll $8, %r9d ; SCALAR-NEXT: orl %eax, %r9d -; SCALAR-NEXT: shll $16, %r10d ; SCALAR-NEXT: movzwl %r9w, %eax ; SCALAR-NEXT: orl %r10d, %eax ; SCALAR-NEXT: shlq $32, %r8 ; SCALAR-NEXT: orq %r8, %rax -; SCALAR-NEXT: movq %rax, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rsi) ; SCALAR-NEXT: movq %rcx, 8(%rsi) -; SCALAR-NEXT: movq %rdi, 16(%rsi) -; SCALAR-NEXT: movq %rax, (%rdx) +; SCALAR-NEXT: movq %rdi, (%rsi) +; SCALAR-NEXT: movq %rax, 16(%rdx) ; SCALAR-NEXT: movq %rcx, 8(%rdx) -; SCALAR-NEXT: movq %rdi, 16(%rdx) -; SCALAR-NEXT: movq %rdi, 48(%rdx) +; SCALAR-NEXT: movq %rdi, (%rdx) +; SCALAR-NEXT: movq %rax, 48(%rdx) ; SCALAR-NEXT: movq %rcx, 40(%rdx) -; SCALAR-NEXT: movq %rax, 32(%rdx) +; SCALAR-NEXT: movq %rdi, 32(%rdx) ; SCALAR-NEXT: retq ; ; SSE2-LABEL: vec384_v24i8: @@ -5482,43 +5090,24 @@ ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX1-NEXT: vmovdqa %xmm0, 48(%rdx) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-ONLY-LABEL: vec512_v2i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512F-LABEL: vec512_v2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: vec512_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-LABEL: vec512_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi) +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i8> %in.subvec.not, store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -5741,53 +5330,20 @@ ; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v2i32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i32> %in.subvec.not, store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -5837,53 +5393,20 @@ ; SCALAR-NEXT: movl %eax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v2f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v2f32: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v2f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v2f32: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64 %in.subvec.int = xor <2 x i32> %in.subvec.not, %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float> @@ -6019,26 +5542,26 @@ define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind { ; ALL-LABEL: vec512_v2i128: ; ALL: # %bb.0: -; ALL-NEXT: movq 16(%rdi), %rax -; ALL-NEXT: movq 24(%rdi), %rcx +; ALL-NEXT: movq 24(%rdi), %rax +; ALL-NEXT: movq 16(%rdi), %rcx ; ALL-NEXT: movq (%rdi), %r8 ; ALL-NEXT: movq 8(%rdi), %rdi -; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %r8 +; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rcx, 24(%rsi) -; ALL-NEXT: movq %r8, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rcx, 16(%rsi) ; ALL-NEXT: movq %rdi, 8(%rsi) -; ALL-NEXT: movq %rax, 16(%rdx) -; ALL-NEXT: movq %rcx, 24(%rdx) -; ALL-NEXT: movq %r8, (%rdx) +; ALL-NEXT: movq %r8, (%rsi) +; ALL-NEXT: movq %rax, 24(%rdx) +; ALL-NEXT: movq %rcx, 16(%rdx) ; ALL-NEXT: movq %rdi, 8(%rdx) -; ALL-NEXT: movq %rax, 48(%rdx) -; ALL-NEXT: movq %rcx, 56(%rdx) -; ALL-NEXT: movq %r8, 32(%rdx) +; ALL-NEXT: movq %r8, (%rdx) +; ALL-NEXT: movq %rax, 56(%rdx) +; ALL-NEXT: movq %rcx, 48(%rdx) ; ALL-NEXT: movq %rdi, 40(%rdx) +; ALL-NEXT: movq %r8, 32(%rdx) ; ALL-NEXT: retq %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <2 x i128> %in.subvec.not, @@ -6262,53 +5785,20 @@ ; SCALAR-NEXT: movw %ax, 56(%rdx) ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v4i16: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v4i16: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <4 x i16> %in.subvec.not, store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6454,15 +5944,15 @@ ; SSE2-LABEL: vec512_v4i64: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v4i64: @@ -6523,15 +6013,15 @@ ; SSE2-LABEL: vec512_v4f64: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v4f64: @@ -6660,53 +6150,20 @@ ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: retq ; -; SSE2-LABEL: vec512_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rsi) -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, 48(%rdx) -; SSE2-NEXT: retq -; -; AVX1-LABEL: vec512_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-ONLY-LABEL: vec512_v8i8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: vec512_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; SSE-LABEL: vec512_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: notq %rax +; SSE-NEXT: movq %rax, (%rsi) +; SSE-NEXT: movq %rax, (%rdx) +; SSE-NEXT: movq %rax, 8(%rdx) +; SSE-NEXT: movq %rax, 16(%rdx) +; SSE-NEXT: movq %rax, 24(%rdx) +; SSE-NEXT: movq %rax, 32(%rdx) +; SSE-NEXT: movq %rax, 40(%rdx) +; SSE-NEXT: movq %rax, 48(%rdx) +; SSE-NEXT: movq %rax, 56(%rdx) +; SSE-NEXT: retq %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64 %in.subvec = xor <8 x i8> %in.subvec.not, store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64 @@ -6877,15 +6334,15 @@ ; SSE2-LABEL: vec512_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v8i32: @@ -6968,15 +6425,15 @@ ; SSE2-LABEL: vec512_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v8f32: @@ -7367,15 +6824,15 @@ ; SSE2-LABEL: vec512_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v16i16: @@ -7682,15 +7139,15 @@ ; SSE2-LABEL: vec512_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pxor (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsi) -; SSE2-NEXT: movdqa %xmm1, 16(%rsi) -; SSE2-NEXT: movdqa %xmm0, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm1, 48(%rdx) -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: pxor 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 16(%rsi) +; SSE2-NEXT: movdqa %xmm1, (%rsi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) ; SSE2-NEXT: retq ; ; AVX1-LABEL: vec512_v32i8: @@ -7723,4 +7180,6 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512BW: {{.*}} +; AVX512F: {{.*}} ; SSSE3: {{.*}} diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll --- a/llvm/test/CodeGen/X86/switch.ll +++ b/llvm/test/CodeGen/X86/switch.ll @@ -1416,10 +1416,10 @@ define void @int_max_table_cluster(i8 %x) { ; CHECK-LABEL: int_max_table_cluster: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb $-9, %dil +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: cmpl $247, %eax ; CHECK-NEXT: ja .LBB15_4 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: jmpq *.LJTI15_0(,%rax,8) ; CHECK-NEXT: .LBB15_2: # %bb0 ; CHECK-NEXT: xorl %edi, %edi @@ -2443,12 +2443,10 @@ define void @pr23738(i4 %x) { ; CHECK-LABEL: pr23738: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andb $15, %al -; CHECK-NEXT: cmpb $11, %al +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: cmpl $11, %edi ; CHECK-NEXT: ja .LBB23_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: movl $2051, %eax # imm = 0x803 ; CHECK-NEXT: btl %edi, %eax ; CHECK-NEXT: jae .LBB23_2 diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -70,8 +70,8 @@ ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] -; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax +; CHECK-X64-NEXT: pslld $8, %xmm0 +; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no diff --git a/llvm/test/CodeGen/X86/test-shrink.ll b/llvm/test/CodeGen/X86/test-shrink.ll --- a/llvm/test/CodeGen/X86/test-shrink.ll +++ b/llvm/test/CodeGen/X86/test-shrink.ll @@ -822,8 +822,8 @@ ; ; CHECK-X86-LABEL: and64_trunc_16_sign: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: movl $32768, %eax # imm = 0x8000 -; CHECK-X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-X86-NEXT: testw %ax, %ax ; CHECK-X86-NEXT: js .LBB18_2 ; CHECK-X86-NEXT: # %bb.1: # %yes @@ -867,7 +867,8 @@ ; ; CHECK-X86-LABEL: and64_trunc_16_sign_minsize: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: testw $-32768, {{[0-9]+}}(%esp) # imm = 0x8000 +; CHECK-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-X86-NEXT: testw $-32768, %ax # imm = 0x8000 ; CHECK-X86-NEXT: js .LBB19_2 ; CHECK-X86-NEXT: # %bb.1: # %yes ; CHECK-X86-NEXT: calll bar@PLT diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll --- a/llvm/test/CodeGen/X86/test-vs-bittest.ll +++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll @@ -601,9 +601,10 @@ define i64 @is_lower_bit_clear_i64(i64 %x) { ; CHECK-LABEL: is_lower_bit_clear_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testl $134217728, %edi # imm = 0x8000000 -; CHECK-NEXT: sete %al +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: shrl $27, %eax +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: xorq $1, %rax ; CHECK-NEXT: retq %sh = lshr i64 %x, 27 %m = and i64 %sh, 1 diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -89,8 +89,8 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: shll $4, %ecx diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -27,8 +27,7 @@ ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl %ax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shll $8, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %ecx @@ -108,8 +107,8 @@ ; ; X86-LABEL: func3: ; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: shll $4, %ecx @@ -170,23 +169,24 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X64-LABEL: func5: ; X64: # %bb.0: -; X64-NEXT: pushq %rax +; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: leaq (%rdi,%rdi), %rax ; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: shrq $63, %rsi -; X64-NEXT: shldq $31, %rax, %rsi ; X64-NEXT: shlq $32, %rdi +; X64-NEXT: shrq $32, %rsi +; X64-NEXT: xorl %ebx, %ebx ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: callq __udivti3@PLT -; X64-NEXT: cmpq $2, %rdx -; X64-NEXT: movq $-1, %rcx -; X64-NEXT: cmovaeq %rcx, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: sbbq %rbx, %rbx +; X64-NEXT: orq %rax, %rbx ; X64-NEXT: cmpq $1, %rdx ; X64-NEXT: movl $1, %ecx ; X64-NEXT: cmovbq %rdx, %rcx +; X64-NEXT: cmovneq %rbx, %rax ; X64-NEXT: shrdq $1, %rcx, %rax -; X64-NEXT: popq %rcx +; X64-NEXT: popq %rbx ; X64-NEXT: retq ; ; X86-LABEL: func5: @@ -214,15 +214,17 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: calll __udivti3 ; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl $-1, %eax -; X86-NEXT: movl $-1, %edx -; X86-NEXT: jne .LBB4_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl (%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: .LBB4_2: +; X86-NEXT: cmpl $-1, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovel %edx, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %ebp @@ -284,15 +286,14 @@ ; X86-LABEL: func7: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shll $17, %edx -; X86-NEXT: shrl $15, %ecx -; X86-NEXT: andl $1, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $15, %edx +; X86-NEXT: shll $17, %eax ; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax ; X86-NEXT: pushl %ecx ; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: cmpl $131071, %eax # imm = 0x1FFFF @@ -319,7 +320,7 @@ ; X64-NEXT: movq %xmm4, %rcx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm8 +; X64-NEXT: movq %rax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; X64-NEXT: movq %xmm3, %rax ; X64-NEXT: movdqa %xmm1, %xmm3 @@ -328,31 +329,30 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm3 -; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0] +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm3[0] ; X64-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; X64-NEXT: movdqa %xmm8, %xmm3 +; X64-NEXT: movdqa %xmm7, %xmm3 ; X64-NEXT: pxor %xmm4, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; X64-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649,2147483649,2147483649] -; X64-NEXT: pcmpeqd %xmm6, %xmm7 ; X64-NEXT: movdqa {{.*#+}} xmm5 = [9223372043297226751,9223372043297226751] -; X64-NEXT: movdqa %xmm5, %xmm9 -; X64-NEXT: pcmpgtd %xmm3, %xmm9 -; X64-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; X64-NEXT: pand %xmm7, %xmm10 -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3] -; X64-NEXT: por %xmm10, %xmm3 -; X64-NEXT: movdqa {{.*#+}} xmm7 = [8589934591,8589934591] -; X64-NEXT: pand %xmm3, %xmm8 -; X64-NEXT: pandn %xmm7, %xmm3 -; X64-NEXT: por %xmm8, %xmm3 +; X64-NEXT: movdqa %xmm5, %xmm6 +; X64-NEXT: pcmpgtd %xmm3, %xmm6 +; X64-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm5, %xmm3 +; X64-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] +; X64-NEXT: pand %xmm8, %xmm9 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; X64-NEXT: por %xmm9, %xmm3 +; X64-NEXT: movdqa {{.*#+}} xmm6 = [8589934591,8589934591] +; X64-NEXT: pand %xmm3, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm3 +; X64-NEXT: por %xmm7, %xmm3 ; X64-NEXT: psrlq $1, %xmm3 ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X64-NEXT: movq %xmm2, %rax ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx -; X64-NEXT: movq %rax, %xmm8 +; X64-NEXT: movq %rax, %xmm7 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; X64-NEXT: movq %xmm0, %rax ; X64-NEXT: psrlq $32, %xmm1 @@ -360,18 +360,19 @@ ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divq %rcx ; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; X64-NEXT: pxor %xmm8, %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-NEXT: pcmpeqd %xmm6, %xmm0 -; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pand %xmm0, %xmm8 -; X64-NEXT: pandn %xmm7, %xmm0 -; X64-NEXT: por %xmm8, %xmm0 +; X64-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; X64-NEXT: pxor %xmm7, %xmm4 +; X64-NEXT: movdqa %xmm5, %xmm0 +; X64-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-NEXT: pcmpeqd %xmm5, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: por %xmm2, %xmm0 +; X64-NEXT: pand %xmm0, %xmm7 +; X64-NEXT: pandn %xmm6, %xmm0 +; X64-NEXT: por %xmm7, %xmm0 ; X64-NEXT: psrlq $1, %xmm0 ; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; X64-NEXT: retq @@ -382,25 +383,19 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax), %ecx -; X86-NEXT: shrl $31, %eax -; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: leal (%ebx,%ebx), %eax -; X86-NEXT: shrl $31, %ebx -; X86-NEXT: shldl $31, %eax, %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: pushl $0 ; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx @@ -408,59 +403,67 @@ ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: leal (%esi,%esi), %eax -; X86-NEXT: shrl $31, %esi -; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: pushl $0 -; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: leal (%edx,%edx), %ecx -; X86-NEXT: shrl $31, %edx -; X86-NEXT: shldl $31, %ecx, %edx +; X86-NEXT: xorl %esi, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: cmpl $2, %esi -; X86-NEXT: movl $-1, %edx -; X86-NEXT: cmovael %edx, %eax -; X86-NEXT: cmpl $1, %esi -; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovael %ebp, %esi -; X86-NEXT: shldl $31, %eax, %esi -; X86-NEXT: cmpl $2, %ebx +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: cmovel %eax, %ebx +; X86-NEXT: movl $1, %eax +; X86-NEXT: cmovael %eax, %edx +; X86-NEXT: shrdl $1, %edx, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: negl %eax +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: cmovael %edx, %eax -; X86-NEXT: cmpl $1, %ebx -; X86-NEXT: cmovael %ebp, %ebx -; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: cmpl $2, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: cmpl $1, %ebp +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: movl $1, %edx +; X86-NEXT: cmovael %edx, %ebp +; X86-NEXT: shrdl $1, %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovael %edx, %eax -; X86-NEXT: cmpl $1, %edi -; X86-NEXT: cmovael %ebp, %edi -; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: cmovael %edx, %ecx +; X86-NEXT: shrdl $1, %ecx, %ebp ; X86-NEXT: pushl $0 ; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx +; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl $0 ; X86-NEXT: calll __udivdi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: cmpl $2, %edx -; X86-NEXT: movl $-1, %ecx -; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: orl %eax, %esi ; X86-NEXT: cmpl $1, %edx -; X86-NEXT: cmovbl %edx, %ebp -; X86-NEXT: shldl $31, %eax, %ebp +; X86-NEXT: movl $1, %ecx +; X86-NEXT: cmovbl %edx, %ecx +; X86-NEXT: cmovel %eax, %esi +; X86-NEXT: shrdl $1, %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: addl $8, %esp +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: addl $12, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -312,34 +312,38 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: movl $1, %edi -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: cmpl $1, %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: testl %edx, %edx ; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %esi, %ebx -; X86-NEXT: orl %edx, %eax +; X86-NEXT: cmovnel %edi, %ebp +; X86-NEXT: cmovel %esi, %ebp +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovbl %edx, %esi +; X86-NEXT: cmovbl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel %edx, %esi +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %ecx, %edx +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %esi, %ebx -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -529,18 +529,17 @@ ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %r10, %rbp +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rbp ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: addq %rbx, %r15 @@ -552,59 +551,59 @@ ; X64-NEXT: addq %r15, %rbx ; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r10d -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r12, %r13 -; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: adcq %r8, %r15 +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r12, %r10 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r12, %r8 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 +; X64-NEXT: addq %r8, %r12 ; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: imulq %r9, %r11 +; X64-NEXT: imulq %r9, %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: addq %r13, %r14 ; X64-NEXT: adcq %r15, %r12 -; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: adcq %rdx, %r10 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %r8, %rbp ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %r8 ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: imulq %r10, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq %r8, %r11 ; X64-NEXT: addq %r14, %r15 ; X64-NEXT: adcq %r12, %rax -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: imulq %r9, %r8 +; X64-NEXT: adcq %rdx, %r11 +; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: addq %r10, %rcx +; X64-NEXT: addq %r11, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: addq %r8, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: movq %rbx, 8(%rdi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -76,11 +76,11 @@ ; X64-LABEL: func3: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andb $15, %al -; X64-NEXT: andb $15, %sil -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: mulb %sil +; X64-NEXT: andl $15, %esi +; X64-NEXT: andl $15, %eax +; X64-NEXT: imull %esi, %eax ; X64-NEXT: shrb $2, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func3: @@ -89,8 +89,11 @@ ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: andb $15, %cl -; X86-NEXT: mulb %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: imull %ecx, %eax ; X86-NEXT: shrb $2, %al +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll @@ -39,7 +39,7 @@ ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andl $4080, %edi # imm = 0xFF0 -; CHECK-NOBMI-NEXT: andl $-4081, %esi # imm = 0xF00F +; CHECK-NOBMI-NEXT: andl $61455, %esi # imm = 0xF00F ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq @@ -49,7 +49,7 @@ ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andl $4080, %edi # imm = 0xFF0 -; CHECK-BMI-NEXT: andl $-4081, %esi # imm = 0xF00F +; CHECK-BMI-NEXT: andl $61455, %esi # imm = 0xF00F ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll @@ -39,7 +39,7 @@ ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andl $21845, %edi # imm = 0x5555 -; CHECK-NOBMI-NEXT: andl $-21846, %esi # imm = 0xAAAA +; CHECK-NOBMI-NEXT: andl $43690, %esi # imm = 0xAAAA ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq @@ -49,7 +49,7 @@ ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andl $21845, %edi # imm = 0x5555 -; CHECK-BMI-NEXT: andl $-21846, %esi # imm = 0xAAAA +; CHECK-BMI-NEXT: andl $43690, %esi # imm = 0xAAAA ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll @@ -39,7 +39,7 @@ ; CHECK-NOBMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NOBMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NOBMI-NEXT: andl $3855, %edi # imm = 0xF0F -; CHECK-NOBMI-NEXT: andl $-3856, %esi # imm = 0xF0F0 +; CHECK-NOBMI-NEXT: andl $61680, %esi # imm = 0xF0F0 ; CHECK-NOBMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq @@ -49,7 +49,7 @@ ; CHECK-BMI-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-BMI-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-BMI-NEXT: andl $3855, %edi # imm = 0xF0F -; CHECK-BMI-NEXT: andl $-3856, %esi # imm = 0xF0F0 +; CHECK-BMI-NEXT: andl $61680, %esi # imm = 0xF0F0 ; CHECK-BMI-NEXT: leal (%rsi,%rdi), %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll @@ -37,7 +37,7 @@ ; CHECK-NOBMI-LABEL: out16_constmask: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movzbl %dil, %eax -; CHECK-NOBMI-NEXT: andl $-256, %esi +; CHECK-NOBMI-NEXT: andl $65280, %esi # imm = 0xFF00 ; CHECK-NOBMI-NEXT: orl %esi, %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq @@ -45,7 +45,7 @@ ; CHECK-BMI-LABEL: out16_constmask: ; CHECK-BMI: # %bb.0: ; CHECK-BMI-NEXT: movzbl %dil, %eax -; CHECK-BMI-NEXT: andl $-256, %esi +; CHECK-BMI-NEXT: andl $65280, %esi # imm = 0xFF00 ; CHECK-BMI-NEXT: orl %esi, %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -33,11 +33,10 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) { ; CHECK-NOBMI-LABEL: out16: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: andl %esi, %eax -; CHECK-NOBMI-NEXT: orl %edi, %eax +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -11,12 +11,26 @@ ; CHECK-SSE1-LABEL: out_constant_varx_mone: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq (%rcx), %rdx +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rsi), %xmm1 +; CHECK-SSE1-NEXT: orps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_varx_mone: @@ -85,8 +99,24 @@ ; CHECK-SSE1-LABEL: out_constant_varx_mone_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 -; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rcx), %rdx +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -194,12 +224,12 @@ ; CHECK-SSE1-LABEL: in_constant_varx_42: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm1 +; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_varx_42: @@ -270,10 +300,11 @@ ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1 -; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm2 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -306,8 +337,24 @@ ; CHECK-SSE1-LABEL: out_constant_mone_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: orps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rcx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -336,8 +383,24 @@ ; CHECK-SSE1-LABEL: in_constant_mone_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andnps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -366,12 +429,26 @@ ; CHECK-SSE1-LABEL: out_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq (%rcx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rcx), %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rcx +; CHECK-SSE1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rdx), %xmm1 +; CHECK-SSE1-NEXT: orps %xmm0, %xmm1 +; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_constant_mone_vary_invmask: @@ -406,10 +483,27 @@ ; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 ; CHECK-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps (%rdx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movaps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm0, %xmm2 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask: @@ -476,11 +570,25 @@ ; CHECK-SSE1-LABEL: in_constant_42_vary: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; @@ -551,11 +659,26 @@ ; CHECK-SSE1-LABEL: in_constant_42_vary_invmask: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movq (%rdx), %rsi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rsi +; CHECK-SSE1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm2 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2 +; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 ; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) ; CHECK-SSE1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -86,11 +86,10 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: orl %edi, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: xorl %esi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %mx = and <1 x i16> %x, %mask @@ -235,32 +234,28 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: andl %r8d, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notl %eax -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andl %ecx, %r9d -; CHECK-BASELINE-NEXT: orl %esi, %r9d -; CHECK-BASELINE-NEXT: andl %edx, %eax -; CHECK-BASELINE-NEXT: orl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: andl %r8d, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notl %eax -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andl %ecx, %r9d -; CHECK-SSE1-NEXT: orl %esi, %r9d -; CHECK-SSE1-NEXT: andl %edx, %eax -; CHECK-SSE1-NEXT: orl %edi, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i16: @@ -439,49 +434,55 @@ ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r11d, %edx -; CHECK-BASELINE-NEXT: xorl %r10d, %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r10d, %ecx -; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: andl %r11d, %r8d +; CHECK-BASELINE-NEXT: andl %r10d, %ecx +; CHECK-BASELINE-NEXT: andl %edi, %edx +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: notl %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: orl %r8d, %r11d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %ecx, %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %edx, %edi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %si, (%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %r11w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 4(%rax) +; CHECK-BASELINE-NEXT: movw %di, 2(%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r11d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r11d, %edx -; CHECK-SSE1-NEXT: xorl %r10d, %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r10d, %ecx -; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: andl %r11d, %r8d +; CHECK-SSE1-NEXT: andl %r10d, %ecx +; CHECK-SSE1-NEXT: andl %edi, %edx +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: notl %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: orl %r8d, %r11d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %ecx, %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %edx, %edi ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %si, (%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %r11w, 6(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 4(%rax) +; CHECK-SSE1-NEXT: movw %di, 2(%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16: @@ -506,43 +507,47 @@ ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: andl %r10d, %r8d +; CHECK-BASELINE-NEXT: andl %edi, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r10d, %edx -; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: notl %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %r8d, %r10d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %edx, %edi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %si, (%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %di, 2(%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: andl %r10d, %r8d +; CHECK-SSE1-NEXT: andl %edi, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r10d, %edx -; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: notl %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %r8d, %r10d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %edx, %edi ; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: andl {{[0-9]+}}(%rsp), %esi ; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; CHECK-SSE1-NEXT: movw %si, (%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 6(%rax) +; CHECK-SSE1-NEXT: movw %di, 2(%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16_undef: @@ -877,118 +882,118 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v8i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: pushq %rbp -; CHECK-BASELINE-NEXT: pushq %r15 -; CHECK-BASELINE-NEXT: pushq %r14 -; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-BASELINE-NEXT: xorl %r12d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r12d, %esi -; CHECK-BASELINE-NEXT: xorl %r15d, %edx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-BASELINE-NEXT: xorl %r15d, %edx -; CHECK-BASELINE-NEXT: xorl %r14d, %ecx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r14d, %ecx -; CHECK-BASELINE-NEXT: xorl %ebp, %r8d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-BASELINE-NEXT: xorl %ebp, %r8d -; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: movl %r11d, %ebx -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movl %r10d, %r11d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: andw %r11w, %bx +; CHECK-BASELINE-NEXT: notl %r11d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-BASELINE-NEXT: xorl %r10d, %r11d -; CHECK-BASELINE-NEXT: movl %edi, %r10d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: orl %ebx, %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %r10w, %bx +; CHECK-BASELINE-NEXT: notl %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-BASELINE-NEXT: xorl %edi, %r10d -; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) -; CHECK-BASELINE-NEXT: movw %r11w, 12(%rax) -; CHECK-BASELINE-NEXT: movw %bx, 10(%rax) -; CHECK-BASELINE-NEXT: movw %r9w, 8(%rax) -; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) -; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) -; CHECK-BASELINE-NEXT: movw %si, (%rax) +; CHECK-BASELINE-NEXT: orl %ebx, %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andw %di, %bx +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-BASELINE-NEXT: orl %ebx, %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: andl %ebx, %r9d +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: orl %r9d, %ebx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r9d +; CHECK-BASELINE-NEXT: andl %r9d, %r8d +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-BASELINE-NEXT: orl %r8d, %r9d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-BASELINE-NEXT: andl %r8d, %ecx +; CHECK-BASELINE-NEXT: notl %r8d +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-BASELINE-NEXT: orl %ecx, %r8d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-BASELINE-NEXT: andl %ecx, %edx +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: orl %edx, %ecx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-BASELINE-NEXT: andl %edx, %esi +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-BASELINE-NEXT: orl %esi, %edx +; CHECK-BASELINE-NEXT: movw %r11w, 14(%rax) +; CHECK-BASELINE-NEXT: movw %r10w, 12(%rax) +; CHECK-BASELINE-NEXT: movw %di, 10(%rax) +; CHECK-BASELINE-NEXT: movw %bx, 8(%rax) +; CHECK-BASELINE-NEXT: movw %r9w, 6(%rax) +; CHECK-BASELINE-NEXT: movw %r8w, 4(%rax) +; CHECK-BASELINE-NEXT: movw %cx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %dx, (%rax) ; CHECK-BASELINE-NEXT: popq %rbx -; CHECK-BASELINE-NEXT: popq %r12 -; CHECK-BASELINE-NEXT: popq %r14 -; CHECK-BASELINE-NEXT: popq %r15 -; CHECK-BASELINE-NEXT: popq %rbp ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v8i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: pushq %rbp -; CHECK-SSE1-NEXT: pushq %r15 -; CHECK-SSE1-NEXT: pushq %r14 -; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d -; CHECK-SSE1-NEXT: xorl %r12d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r12d, %esi -; CHECK-SSE1-NEXT: xorl %r15d, %edx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx -; CHECK-SSE1-NEXT: xorl %r15d, %edx -; CHECK-SSE1-NEXT: xorl %r14d, %ecx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r14d, %ecx -; CHECK-SSE1-NEXT: xorl %ebp, %r8d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w -; CHECK-SSE1-NEXT: xorl %ebp, %r8d -; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w -; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: movl %r11d, %ebx -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx -; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movl %r10d, %r11d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: andw %r11w, %bx +; CHECK-SSE1-NEXT: notl %r11d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w -; CHECK-SSE1-NEXT: xorl %r10d, %r11d -; CHECK-SSE1-NEXT: movl %edi, %r10d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: orl %ebx, %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %r10w, %bx +; CHECK-SSE1-NEXT: notl %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w -; CHECK-SSE1-NEXT: xorl %edi, %r10d -; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) -; CHECK-SSE1-NEXT: movw %r11w, 12(%rax) -; CHECK-SSE1-NEXT: movw %bx, 10(%rax) -; CHECK-SSE1-NEXT: movw %r9w, 8(%rax) -; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) -; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %dx, 2(%rax) -; CHECK-SSE1-NEXT: movw %si, (%rax) +; CHECK-SSE1-NEXT: orl %ebx, %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andw %di, %bx +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %di +; CHECK-SSE1-NEXT: orl %ebx, %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: andl %ebx, %r9d +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: orl %r9d, %ebx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r9d +; CHECK-SSE1-NEXT: andl %r9d, %r8d +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w +; CHECK-SSE1-NEXT: orl %r8d, %r9d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r8d +; CHECK-SSE1-NEXT: andl %r8d, %ecx +; CHECK-SSE1-NEXT: notl %r8d +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w +; CHECK-SSE1-NEXT: orl %ecx, %r8d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-SSE1-NEXT: andl %ecx, %edx +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: orl %edx, %ecx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-SSE1-NEXT: andl %edx, %esi +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx +; CHECK-SSE1-NEXT: orl %esi, %edx +; CHECK-SSE1-NEXT: movw %r11w, 14(%rax) +; CHECK-SSE1-NEXT: movw %r10w, 12(%rax) +; CHECK-SSE1-NEXT: movw %di, 10(%rax) +; CHECK-SSE1-NEXT: movw %bx, 8(%rax) +; CHECK-SSE1-NEXT: movw %r9w, 6(%rax) +; CHECK-SSE1-NEXT: movw %r8w, 4(%rax) +; CHECK-SSE1-NEXT: movw %cx, 2(%rax) +; CHECK-SSE1-NEXT: movw %dx, (%rax) ; CHECK-SSE1-NEXT: popq %rbx -; CHECK-SSE1-NEXT: popq %r12 -; CHECK-SSE1-NEXT: popq %r14 -; CHECK-SSE1-NEXT: popq %r15 -; CHECK-SSE1-NEXT: popq %rbp ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v8i16: @@ -1759,113 +1764,135 @@ ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax -; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d -; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r10w, %ax -; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r10d -; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r11w, %ax -; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r11d -; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bx, %ax -; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx -; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bp, %ax -; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r14w, %ax -; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r14d -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax -; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r15d -; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax -; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorw %ax, %r10w -; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w -; CHECK-BASELINE-NEXT: xorl %r10d, %eax -; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorw %r10w, %r11w -; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w -; CHECK-BASELINE-NEXT: xorl %r11d, %r10d -; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx -; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi -; CHECK-BASELINE-NEXT: xorw %dx, %si -; CHECK-BASELINE-NEXT: andw 30(%rcx), %si -; CHECK-BASELINE-NEXT: xorl %esi, %edx -; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) -; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) -; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) -; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) -; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) -; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) -; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) -; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) -; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) -; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%rdi) -; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movq %rcx, %r9 +; CHECK-BASELINE-NEXT: movq %rdx, %r10 +; CHECK-BASELINE-NEXT: movq %rsi, %r8 +; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movl 12(%rcx), %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 14(%rcx), %edx +; CHECK-BASELINE-NEXT: movl 16(%rcx), %esi +; CHECK-BASELINE-NEXT: movzwl 18(%rcx), %edi +; CHECK-BASELINE-NEXT: movl 20(%rcx), %ecx +; CHECK-BASELINE-NEXT: movzwl 22(%r9), %ebx +; CHECK-BASELINE-NEXT: movl 24(%r9), %ebp +; CHECK-BASELINE-NEXT: movzwl 26(%r9), %r14d +; CHECK-BASELINE-NEXT: movl 28(%r9), %r15d +; CHECK-BASELINE-NEXT: movzwl 30(%r9), %r12d +; CHECK-BASELINE-NEXT: movzwl 30(%r8), %r13d +; CHECK-BASELINE-NEXT: andw %r12w, %r13w +; CHECK-BASELINE-NEXT: notl %r12d +; CHECK-BASELINE-NEXT: andw 30(%r10), %r12w +; CHECK-BASELINE-NEXT: orl %r13d, %r12d +; CHECK-BASELINE-NEXT: movzwl 28(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r15w, %ax +; CHECK-BASELINE-NEXT: notl %r15d +; CHECK-BASELINE-NEXT: andw 28(%r10), %r15w +; CHECK-BASELINE-NEXT: orl %eax, %r15d +; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r14w, %ax +; CHECK-BASELINE-NEXT: notl %r14d +; CHECK-BASELINE-NEXT: andw 26(%r10), %r14w +; CHECK-BASELINE-NEXT: orl %eax, %r14d +; CHECK-BASELINE-NEXT: movzwl 24(%r8), %eax +; CHECK-BASELINE-NEXT: andw %bp, %ax +; CHECK-BASELINE-NEXT: notl %ebp +; CHECK-BASELINE-NEXT: andw 24(%r10), %bp +; CHECK-BASELINE-NEXT: orl %eax, %ebp +; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax +; CHECK-BASELINE-NEXT: andw %bx, %ax +; CHECK-BASELINE-NEXT: notl %ebx +; CHECK-BASELINE-NEXT: andw 22(%r10), %bx +; CHECK-BASELINE-NEXT: orl %eax, %ebx +; CHECK-BASELINE-NEXT: movzwl 20(%r8), %eax +; CHECK-BASELINE-NEXT: andw %cx, %ax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw 20(%r10), %cx +; CHECK-BASELINE-NEXT: orl %eax, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 18(%r8), %eax +; CHECK-BASELINE-NEXT: andw %di, %ax +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw 18(%r10), %di +; CHECK-BASELINE-NEXT: orl %eax, %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 16(%r8), %eax +; CHECK-BASELINE-NEXT: andw %si, %ax +; CHECK-BASELINE-NEXT: notl %esi +; CHECK-BASELINE-NEXT: andw 16(%r10), %si +; CHECK-BASELINE-NEXT: orl %eax, %esi +; CHECK-BASELINE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 14(%r8), %eax +; CHECK-BASELINE-NEXT: andw %dx, %ax +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw 14(%r10), %dx +; CHECK-BASELINE-NEXT: orl %eax, %edx +; CHECK-BASELINE-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 12(%r8), %eax +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: andw %cx, %ax +; CHECK-BASELINE-NEXT: notl %ecx +; CHECK-BASELINE-NEXT: andw 12(%r10), %cx +; CHECK-BASELINE-NEXT: orl %eax, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%r9), %r13d +; CHECK-BASELINE-NEXT: movzwl 10(%r8), %eax +; CHECK-BASELINE-NEXT: andw %r13w, %ax +; CHECK-BASELINE-NEXT: notl %r13d +; CHECK-BASELINE-NEXT: andw 10(%r10), %r13w +; CHECK-BASELINE-NEXT: orl %eax, %r13d +; CHECK-BASELINE-NEXT: movl 8(%r9), %edi +; CHECK-BASELINE-NEXT: movzwl 8(%r8), %eax +; CHECK-BASELINE-NEXT: andw %di, %ax +; CHECK-BASELINE-NEXT: notl %edi +; CHECK-BASELINE-NEXT: andw 8(%r10), %di +; CHECK-BASELINE-NEXT: orl %eax, %edi +; CHECK-BASELINE-NEXT: movzwl 6(%r9), %esi +; CHECK-BASELINE-NEXT: movzwl 6(%r8), %eax +; CHECK-BASELINE-NEXT: andw %si, %ax +; CHECK-BASELINE-NEXT: notl %esi +; CHECK-BASELINE-NEXT: andw 6(%r10), %si +; CHECK-BASELINE-NEXT: orl %eax, %esi +; CHECK-BASELINE-NEXT: movl 4(%r9), %edx +; CHECK-BASELINE-NEXT: movzwl 4(%r8), %eax +; CHECK-BASELINE-NEXT: andw %dx, %ax +; CHECK-BASELINE-NEXT: notl %edx +; CHECK-BASELINE-NEXT: andw 4(%r10), %dx +; CHECK-BASELINE-NEXT: orl %eax, %edx +; CHECK-BASELINE-NEXT: movzwl 2(%r9), %eax +; CHECK-BASELINE-NEXT: movzwl 2(%r8), %ecx +; CHECK-BASELINE-NEXT: andw %ax, %cx +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: andw 2(%r10), %ax +; CHECK-BASELINE-NEXT: orl %ecx, %eax +; CHECK-BASELINE-NEXT: movl (%r9), %r9d +; CHECK-BASELINE-NEXT: movzwl (%r8), %ecx +; CHECK-BASELINE-NEXT: andw %r9w, %cx +; CHECK-BASELINE-NEXT: notl %r9d +; CHECK-BASELINE-NEXT: andw (%r10), %r9w +; CHECK-BASELINE-NEXT: orl %ecx, %r9d +; CHECK-BASELINE-NEXT: movw %r12w, 30(%r11) +; CHECK-BASELINE-NEXT: movw %r15w, 28(%r11) +; CHECK-BASELINE-NEXT: movw %r14w, 26(%r11) +; CHECK-BASELINE-NEXT: movw %bp, 24(%r11) +; CHECK-BASELINE-NEXT: movw %bx, 22(%r11) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 20(%r11) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 18(%r11) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 16(%r11) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 14(%r11) +; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-BASELINE-NEXT: movw %cx, 12(%r11) +; CHECK-BASELINE-NEXT: movw %r13w, 10(%r11) +; CHECK-BASELINE-NEXT: movw %di, 8(%r11) +; CHECK-BASELINE-NEXT: movw %si, 6(%r11) +; CHECK-BASELINE-NEXT: movw %dx, 4(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 2(%r11) +; CHECK-BASELINE-NEXT: movw %r9w, (%r11) +; CHECK-BASELINE-NEXT: movq %r11, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1882,113 +1909,135 @@ ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d -; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp -; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl (%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax -; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d -; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 4(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r10w, %ax -; CHECK-SSE1-NEXT: andw 6(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r10d -; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r11w, %ax -; CHECK-SSE1-NEXT: andw 8(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r11d -; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bx, %ax -; CHECK-SSE1-NEXT: andw 12(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx -; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bp, %ax -; CHECK-SSE1-NEXT: andw 14(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r14w, %ax -; CHECK-SSE1-NEXT: andw 16(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r14d -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax -; CHECK-SSE1-NEXT: andw 18(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r15d -; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 20(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 22(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw 24(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax -; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d -; CHECK-SSE1-NEXT: xorw %ax, %r10w -; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w -; CHECK-SSE1-NEXT: xorl %r10d, %eax -; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d -; CHECK-SSE1-NEXT: xorw %r10w, %r11w -; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w -; CHECK-SSE1-NEXT: xorl %r11d, %r10d -; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx -; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi -; CHECK-SSE1-NEXT: xorw %dx, %si -; CHECK-SSE1-NEXT: andw 30(%rcx), %si -; CHECK-SSE1-NEXT: xorl %esi, %edx -; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) -; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi) -; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) -; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) -; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) -; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) -; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) -; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) -; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) -; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) -; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%rdi) -; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movq %rcx, %r9 +; CHECK-SSE1-NEXT: movq %rdx, %r10 +; CHECK-SSE1-NEXT: movq %rsi, %r8 +; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movl 12(%rcx), %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 14(%rcx), %edx +; CHECK-SSE1-NEXT: movl 16(%rcx), %esi +; CHECK-SSE1-NEXT: movzwl 18(%rcx), %edi +; CHECK-SSE1-NEXT: movl 20(%rcx), %ecx +; CHECK-SSE1-NEXT: movzwl 22(%r9), %ebx +; CHECK-SSE1-NEXT: movl 24(%r9), %ebp +; CHECK-SSE1-NEXT: movzwl 26(%r9), %r14d +; CHECK-SSE1-NEXT: movl 28(%r9), %r15d +; CHECK-SSE1-NEXT: movzwl 30(%r9), %r12d +; CHECK-SSE1-NEXT: movzwl 30(%r8), %r13d +; CHECK-SSE1-NEXT: andw %r12w, %r13w +; CHECK-SSE1-NEXT: notl %r12d +; CHECK-SSE1-NEXT: andw 30(%r10), %r12w +; CHECK-SSE1-NEXT: orl %r13d, %r12d +; CHECK-SSE1-NEXT: movzwl 28(%r8), %eax +; CHECK-SSE1-NEXT: andw %r15w, %ax +; CHECK-SSE1-NEXT: notl %r15d +; CHECK-SSE1-NEXT: andw 28(%r10), %r15w +; CHECK-SSE1-NEXT: orl %eax, %r15d +; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax +; CHECK-SSE1-NEXT: andw %r14w, %ax +; CHECK-SSE1-NEXT: notl %r14d +; CHECK-SSE1-NEXT: andw 26(%r10), %r14w +; CHECK-SSE1-NEXT: orl %eax, %r14d +; CHECK-SSE1-NEXT: movzwl 24(%r8), %eax +; CHECK-SSE1-NEXT: andw %bp, %ax +; CHECK-SSE1-NEXT: notl %ebp +; CHECK-SSE1-NEXT: andw 24(%r10), %bp +; CHECK-SSE1-NEXT: orl %eax, %ebp +; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax +; CHECK-SSE1-NEXT: andw %bx, %ax +; CHECK-SSE1-NEXT: notl %ebx +; CHECK-SSE1-NEXT: andw 22(%r10), %bx +; CHECK-SSE1-NEXT: orl %eax, %ebx +; CHECK-SSE1-NEXT: movzwl 20(%r8), %eax +; CHECK-SSE1-NEXT: andw %cx, %ax +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 20(%r10), %cx +; CHECK-SSE1-NEXT: orl %eax, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 18(%r8), %eax +; CHECK-SSE1-NEXT: andw %di, %ax +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw 18(%r10), %di +; CHECK-SSE1-NEXT: orl %eax, %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 16(%r8), %eax +; CHECK-SSE1-NEXT: andw %si, %ax +; CHECK-SSE1-NEXT: notl %esi +; CHECK-SSE1-NEXT: andw 16(%r10), %si +; CHECK-SSE1-NEXT: orl %eax, %esi +; CHECK-SSE1-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 14(%r8), %eax +; CHECK-SSE1-NEXT: andw %dx, %ax +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw 14(%r10), %dx +; CHECK-SSE1-NEXT: orl %eax, %edx +; CHECK-SSE1-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 12(%r8), %eax +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: andw %cx, %ax +; CHECK-SSE1-NEXT: notl %ecx +; CHECK-SSE1-NEXT: andw 12(%r10), %cx +; CHECK-SSE1-NEXT: orl %eax, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%r9), %r13d +; CHECK-SSE1-NEXT: movzwl 10(%r8), %eax +; CHECK-SSE1-NEXT: andw %r13w, %ax +; CHECK-SSE1-NEXT: notl %r13d +; CHECK-SSE1-NEXT: andw 10(%r10), %r13w +; CHECK-SSE1-NEXT: orl %eax, %r13d +; CHECK-SSE1-NEXT: movl 8(%r9), %edi +; CHECK-SSE1-NEXT: movzwl 8(%r8), %eax +; CHECK-SSE1-NEXT: andw %di, %ax +; CHECK-SSE1-NEXT: notl %edi +; CHECK-SSE1-NEXT: andw 8(%r10), %di +; CHECK-SSE1-NEXT: orl %eax, %edi +; CHECK-SSE1-NEXT: movzwl 6(%r9), %esi +; CHECK-SSE1-NEXT: movzwl 6(%r8), %eax +; CHECK-SSE1-NEXT: andw %si, %ax +; CHECK-SSE1-NEXT: notl %esi +; CHECK-SSE1-NEXT: andw 6(%r10), %si +; CHECK-SSE1-NEXT: orl %eax, %esi +; CHECK-SSE1-NEXT: movl 4(%r9), %edx +; CHECK-SSE1-NEXT: movzwl 4(%r8), %eax +; CHECK-SSE1-NEXT: andw %dx, %ax +; CHECK-SSE1-NEXT: notl %edx +; CHECK-SSE1-NEXT: andw 4(%r10), %dx +; CHECK-SSE1-NEXT: orl %eax, %edx +; CHECK-SSE1-NEXT: movzwl 2(%r9), %eax +; CHECK-SSE1-NEXT: movzwl 2(%r8), %ecx +; CHECK-SSE1-NEXT: andw %ax, %cx +; CHECK-SSE1-NEXT: notl %eax +; CHECK-SSE1-NEXT: andw 2(%r10), %ax +; CHECK-SSE1-NEXT: orl %ecx, %eax +; CHECK-SSE1-NEXT: movl (%r9), %r9d +; CHECK-SSE1-NEXT: movzwl (%r8), %ecx +; CHECK-SSE1-NEXT: andw %r9w, %cx +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andw (%r10), %r9w +; CHECK-SSE1-NEXT: orl %ecx, %r9d +; CHECK-SSE1-NEXT: movw %r12w, 30(%r11) +; CHECK-SSE1-NEXT: movw %r15w, 28(%r11) +; CHECK-SSE1-NEXT: movw %r14w, 26(%r11) +; CHECK-SSE1-NEXT: movw %bp, 24(%r11) +; CHECK-SSE1-NEXT: movw %bx, 22(%r11) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 20(%r11) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 18(%r11) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 16(%r11) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 14(%r11) +; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE1-NEXT: movw %cx, 12(%r11) +; CHECK-SSE1-NEXT: movw %r13w, 10(%r11) +; CHECK-SSE1-NEXT: movw %di, 8(%r11) +; CHECK-SSE1-NEXT: movw %si, 6(%r11) +; CHECK-SSE1-NEXT: movw %dx, 4(%r11) +; CHECK-SSE1-NEXT: movw %ax, 2(%r11) +; CHECK-SSE1-NEXT: movw %r9w, (%r11) +; CHECK-SSE1-NEXT: movq %r11, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 @@ -3144,12 +3193,26 @@ ; CHECK-SSE1-LABEL: in_v4i32: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1 -; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1 -; CHECK-SSE1-NEXT: andps (%rsi), %xmm0 -; CHECK-SSE1-NEXT: orps %xmm1, %xmm0 -; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi) +; CHECK-SSE1-NEXT: movq (%rdx), %rdi +; CHECK-SSE1-NEXT: movq 8(%rdx), %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdx +; CHECK-SSE1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: shrq $32, %rdi +; CHECK-SSE1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE1-NEXT: movaps (%rsi), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: andps (%rcx), %xmm0 +; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0 +; CHECK-SSE1-NEXT: movaps %xmm0, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: in_v4i32: diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -62,22 +62,33 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; X86-LABEL: test_urem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax -; X86-NEXT: andb $15, %al -; X86-NEXT: cmpb $4, %al -; X86-NEXT: setae %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $15, %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %edx +; X86-NEXT: leal (%ecx,%edx,4), %ecx +; X86-NEXT: shrb $6, %cl +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: leal (%ecx,%ecx,4), %ecx +; X86-NEXT: subb %cl, %al +; X86-NEXT: testb $15, %al +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: movl %edi, %eax ; X64-NEXT: andb $15, %al -; X64-NEXT: cmpb $4, %al -; X64-NEXT: setae %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal (%rax,%rax,2), %ecx +; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: shrb $6, %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: subb %al, %dil +; X64-NEXT: testb $15, %dil +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -5,18 +5,27 @@ define i1 @t32_3_1(i32 %X) nounwind { ; X86-LABEL: t32_3_1: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_3_1: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $33, %rcx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 1 @@ -26,18 +35,27 @@ define i1 @t32_3_2(i32 %X) nounwind { ; X86-LABEL: t32_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X64-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $33, %rcx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 2 @@ -48,18 +66,27 @@ define i1 @t32_5_1(i32 %X) nounwind { ; X86-LABEL: t32_5_1: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_1: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 1 @@ -69,18 +96,27 @@ define i1 @t32_5_2(i32 %X) nounwind { ; X86-LABEL: t32_5_2: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $1717986918, %eax # imm = 0x66666666 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_2: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $1717986918, %eax # imm = 0x66666666 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 2 @@ -90,18 +126,27 @@ define i1 @t32_5_3(i32 %X) nounwind { ; X86-LABEL: t32_5_3: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $-1717986919, %eax # imm = 0x99999999 -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $3, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_3: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $-1717986919, %eax # imm = 0x99999999 -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $3, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 3 @@ -111,18 +156,27 @@ define i1 @t32_5_4(i32 %X) nounwind { ; X86-LABEL: t32_5_4: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC -; X86-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $4, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_5_4: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: addl $-858993460, %eax # imm = 0xCCCCCCCC -; X64-NEXT: cmpl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $4, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 4 @@ -133,20 +187,29 @@ define i1 @t32_6_1(i32 %X) nounwind { ; X86-LABEL: t32_6_1: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_1: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $1, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 1 @@ -156,20 +219,29 @@ define i1 @t32_6_2(i32 %X) nounwind { ; X86-LABEL: t32_6_2: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $2, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_2: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $2, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 2 @@ -179,20 +251,29 @@ define i1 @t32_6_3(i32 %X) nounwind { ; X86-LABEL: t32_6_3: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: decl %eax -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $3, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_3: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: decl %eax -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827883, %eax # imm = 0x2AAAAAAB -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $3, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 3 @@ -202,20 +283,29 @@ define i1 @t32_6_4(i32 %X) nounwind { ; X86-LABEL: t32_6_4: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $1431655764, %eax # imm = 0x55555554 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $4, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_4: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $1431655764, %eax # imm = 0x55555554 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $4, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 4 @@ -225,20 +315,29 @@ define i1 @t32_6_5(i32 %X) nounwind { ; X86-LABEL: t32_6_5: ; X86: # %bb.0: -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %eax # imm = 0xAAAAAAAB -; X86-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl %edx +; X86-NEXT: andl $-2, %edx +; X86-NEXT: leal (%edx,%edx,2), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: cmpl $5, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t32_6_5: ; X64: # %bb.0: -; X64-NEXT: imull $-1431655765, %edi, %eax # imm = 0xAAAAAAAB -; X64-NEXT: addl $-1431655767, %eax # imm = 0xAAAAAAA9 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $715827882, %eax # imm = 0x2AAAAAAA -; X64-NEXT: setb %al +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2863311531, %ecx # imm = 0xAAAAAAAB +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: addl %ecx, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpl $5, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 5 @@ -251,20 +350,24 @@ define i1 @t16_3_2(i16 %X) nounwind { ; X86-LABEL: t16_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-21845, {{[0-9]+}}(%esp), %eax # imm = 0xAAAB -; X86-NEXT: addl $-21846, %eax # imm = 0xAAAA -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: cmpl $21845, %eax # imm = 0x5555 -; X86-NEXT: setb %al +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $43691, %eax, %ecx # imm = 0xAAAB +; X86-NEXT: shrl $17, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: cmpw $2, %ax +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t16_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-21845, %edi, %eax # imm = 0xAAAB -; X64-NEXT: addl $-21846, %eax # imm = 0xAAAA -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: cmpl $21845, %eax # imm = 0x5555 -; X64-NEXT: setb %al +; X64-NEXT: movzwl %di, %eax +; X64-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB +; X64-NEXT: shrl $17, %eax +; X64-NEXT: leal (%rax,%rax,2), %eax +; X64-NEXT: subl %eax, %edi +; X64-NEXT: cmpw $2, %di +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -274,18 +377,24 @@ define i1 @t8_3_2(i8 %X) nounwind { ; X86-LABEL: t8_3_2: ; X86: # %bb.0: -; X86-NEXT: imull $-85, {{[0-9]+}}(%esp), %eax -; X86-NEXT: addb $-86, %al -; X86-NEXT: cmpb $85, %al -; X86-NEXT: setb %al +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $171, %eax, %ecx +; X86-NEXT: shrl $9, %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: subb %cl, %al +; X86-NEXT: cmpb $2, %al +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: t8_3_2: ; X64: # %bb.0: -; X64-NEXT: imull $-85, %edi, %eax -; X64-NEXT: addb $-86, %al -; X64-NEXT: cmpb $85, %al -; X64-NEXT: setb %al +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: imull $171, %eax, %ecx +; X64-NEXT: shrl $9, %ecx +; X64-NEXT: leal (%rcx,%rcx,2), %ecx +; X64-NEXT: subb %cl, %al +; X64-NEXT: cmpb $2, %al +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 @@ -312,13 +421,14 @@ ; ; X64-LABEL: t64_3_2: ; X64: # %bb.0: -; X64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB -; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: setb %al +; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,2), %rax +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: cmpq $2, %rdi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i64 %X, 3 %cmp = icmp eq i64 %urem, 2 diff --git a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll --- a/llvm/test/CodeGen/X86/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-optsize.ll @@ -47,10 +47,15 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; X86-LABEL: test_optsize: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %eax # imm = 0xCCCCCCCD -; X86-NEXT: cmpl $858993460, %eax # imm = 0x33333334 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl $42, %eax -; X86-NEXT: jb .LBB1_2 +; X86-NEXT: je .LBB1_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: movl $-10, %eax ; X86-NEXT: .LBB1_2: @@ -58,11 +63,15 @@ ; ; X64-LABEL: test_optsize: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %eax # imm = 0xCCCCCCCD -; X64-NEXT: cmpl $858993460, %eax # imm = 0x33333334 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %eax +; X64-NEXT: cmpl %eax, %edi ; X64-NEXT: movl $42, %ecx ; X64-NEXT: movl $-10, %eax -; X64-NEXT: cmovbl %ecx, %eax +; X64-NEXT: cmovel %ecx, %eax ; X64-NEXT: retq %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -9,68 +9,108 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $3, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,171798691,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -86,33 +126,95 @@ define <4 x i32> @test_urem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_eq: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_eq: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -121,33 +223,96 @@ define <4 x i32> @test_urem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2147483649,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,858993460,2,858993460] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2147483649,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_ne: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_ne: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -158,73 +323,101 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -236,75 +429,104 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783379,306783379,2,306783379] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -316,68 +538,108 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -389,70 +651,111 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: psrld $1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993460,306783379,2,42949673] -; CHECK-SSE41-NEXT: pmaxud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 +; CHECK-SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -466,62 +769,89 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -535,73 +865,98 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -615,72 +970,104 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -696,48 +1083,98 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -751,63 +1188,105 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: pslld $31, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX2-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: movl $-1840700269, %eax # imm = 0x92492493 +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm3 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -821,68 +1300,103 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2454267027,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2454267027,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -898,62 +1412,89 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,1,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -967,73 +1508,98 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,1,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1047,72 +1613,104 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,306783378,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1128,62 +1726,95 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1197,72 +1828,104 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1276,72 +1939,96 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $5, %xmm3 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm3[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE41-NEXT: psrld $5, %xmm3 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1357,33 +2044,96 @@ define <4 x i32> @test_urem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2147483649,u,3435973837> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,2147483649,u,3435973837> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_allones_and_one: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_one: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -1394,72 +2144,102 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,2147483649,u,2454267027> +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1474,67 +2254,97 @@ ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,2147483649,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,2147483649,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1550,68 +2360,90 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,3435973837> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,858993459] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,3435973837> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1625,72 +2457,99 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,268435456,u,2454267027> +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1705,67 +2564,94 @@ ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: por %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,268435456,u,1374389535> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,268435455,4294967295,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,268435456,u,1374389535> +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1780,62 +2666,98 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993459,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1848,62 +2770,107 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 +; CHECK-SSE2-NEXT: psrld $2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: psrlq $32, %xmm1 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,1,268435455,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psrld $31, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483649,2147483649,2147483649,2147483649] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -1912,3 +2879,5 @@ %ret = zext <4 x i1> %cmp to <4 x i32> ret <4 x i32> %ret } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -8,52 +8,81 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_3: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_3: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655764,1431655764,1431655764] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_3: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_3: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_3: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -63,53 +92,81 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_5: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_5: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pslld $2, %xmm1 +; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_5: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_5: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_5: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpslld $2, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -119,68 +176,81 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part0: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $1, %xmm0 -; CHECK-SSE2-NEXT: pslld $31, %xmm3 -; CHECK-SSE2-NEXT: por %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part0: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part0: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part0: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part0: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -190,67 +260,81 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part1: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $1, %xmm0 -; CHECK-SSE2-NEXT: pslld $31, %xmm3 -; CHECK-SSE2-NEXT: por %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part1: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pslld $31, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part1: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part1: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -260,59 +344,79 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_tautological: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: psrld $1, %xmm2 +; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_tautological: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_tautological: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_tautological: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -9,49 +9,89 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_25: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_25: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -65,64 +105,89 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psrld $5, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: psrld $5, %xmm2 +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] +; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -138,33 +203,95 @@ define <4 x i32> @test_urem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_neg25: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2147483661,2147483661,1374389535] +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE2-NEXT: psrld $3, %xmm2 +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_neg25: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [171798691,1,1,171798691] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,2147483661,2147483661,1374389535] +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 +; CHECK-SSE41-NEXT: psrld $31, %xmm1 +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_odd_neg25: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_odd_neg25: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_odd_neg25: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_odd_neg25: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -176,60 +303,92 @@ ; CHECK-SSE2-LABEL: test_urem_even_neg100: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: psrld $5, %xmm1 ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: pslld $30, %xmm0 -; CHECK-SSE2-NEXT: por %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psrld $2, %xmm2 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: psrld $27, %xmm2 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pslld $30, %xmm0 -; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psrld $5, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm2 +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: psrld $27, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpsrld $27, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] +; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_neg100: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vprord $2, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq @@ -550,50 +709,42 @@ ; We could lower remainder of division by all-ones much better elsewhere. define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { -; CHECK-SSE2-LABEL: test_urem_allones: -; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq -; -; CHECK-SSE41-LABEL: test_urem_allones: -; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: psrld $31, %xmm0 -; CHECK-SSE41-NEXT: retq +; CHECK-SSE-LABEL: test_urem_allones: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-SSE-NEXT: pandn %xmm0, %xmm1 +; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_allones: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_allones: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_allones: ; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 +; CHECK-AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -25,54 +25,89 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[2,3] +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_eq: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -82,60 +117,98 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t1_all_odd_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE2-NEXT: movd %eax, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[2,3] +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t1_all_odd_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,4294967295,4294967295,4294967295] -; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-SSE41-NEXT: movd %eax, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: psrld $1, %xmm2 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t1_all_odd_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX1-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t1_all_odd_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX2-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t1_all_odd_ne: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [954437177,954437177,954437177,954437177] +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: movl $-1431655765, %eax # imm = 0xAAAAAAAB +; CHECK-AVX512VL-NEXT: vmovd %eax, %xmm2 +; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -145,48 +218,71 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-SSE2-LABEL: t2_narrow: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <43691,u,u,58255,43691,u,u,58255> +; CHECK-SSE2-NEXT: pmulhuw %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: psrlw $3, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: psrlw $1, %xmm3 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pandn %xmm3, %xmm1 +; CHECK-SSE2-NEXT: por %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: psubw %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t2_narrow: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [21845,65535,65535,65535,21845,65535,65535,65535] -; CHECK-SSE41-NEXT: pminuw %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <43691,u,u,58255,43691,u,u,58255> +; CHECK-SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 +; CHECK-SSE41-NEXT: psrlw $3, %xmm2 +; CHECK-SSE41-NEXT: psrlw $1, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: psubw %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t2_narrow: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t2_narrow: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t2_narrow: ; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; CHECK-AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; CHECK-AVX512VL-NEXT: vpsrlw $3, %xmm1, %xmm2 +; CHECK-AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] +; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-AVX512VL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -207,16 +303,18 @@ ; CHECK-SSE-NEXT: psllq $32, %xmm0 ; CHECK-SSE-NEXT: paddq %xmm2, %xmm0 ; CHECK-SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE-NEXT: pand %xmm2, %xmm1 +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15372286730238776661,9223372034707292159] +; CHECK-SSE-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-SSE-NEXT: por %xmm1, %xmm0 -; CHECK-SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE-NEXT: pxor %xmm0, %xmm1 -; CHECK-SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero +; CHECK-SSE-NEXT: pand %xmm3, %xmm0 +; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE-NEXT: por %xmm0, %xmm1 +; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE-NEXT: pxor %xmm1, %xmm0 +; CHECK-SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX1-LABEL: t3_wide: @@ -264,9 +362,12 @@ ; CHECK-AVX512VL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; CHECK-AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; CHECK-AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; CHECK-AVX512VL-NEXT: vpcmpleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; CHECK-AVX512VL-NEXT: movb $2, %al +; CHECK-AVX512VL-NEXT: kmovw %eax, %k1 +; CHECK-AVX512VL-NEXT: kxorw %k1, %k0, %k1 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; CHECK-AVX512VL-NEXT: retq %urem = urem <2 x i64> %X, %cmp = icmp eq <2 x i64> %urem, diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll --- a/llvm/test/CodeGen/X86/urem-seteq.ll +++ b/llvm/test/CodeGen/X86/urem-seteq.ll @@ -9,18 +9,27 @@ define i32 @test_urem_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_odd: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 0 @@ -31,18 +40,28 @@ define i32 @test_urem_odd_25(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_25: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: leal (%edx,%edx,4), %eax +; X86-NEXT: leal (%eax,%eax,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_25: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F +; X64-NEXT: shrq $35, %rax +; X64-NEXT: leal (%rax,%rax,4), %eax +; X64-NEXT: leal (%rax,%rax,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $171798692, %ecx # imm = 0xA3D70A4 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 25 %cmp = icmp eq i32 %urem, 0 @@ -54,18 +73,27 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-11, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $1073741827, %edx, %edx # imm = 0x40000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $1789569707, %edi, %ecx # imm = 0x6AAAAAAB +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $4294967285, %ecx # imm = 0xFFFFFFF5 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: imull $1073741827, %ecx, %ecx # imm = 0x40000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 1073741827 %cmp = icmp eq i32 %urem, 0 @@ -77,18 +105,28 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1073741823, %edx # imm = 0x3FFFFFFF +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $29, %edx +; X86-NEXT: imull $-2147483645, %edx, %edx # imm = 0x80000003 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $715827883, %edi, %ecx # imm = 0x2AAAAAAB +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $30, %rcx +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: shrq $61, %rcx +; X64-NEXT: imull $-2147483645, %ecx, %ecx # imm = 0x80000003 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 2147483651 %cmp = icmp eq i32 %urem, 0 @@ -103,23 +141,33 @@ define i16 @test_urem_even(i16 %X) nounwind { ; X86-LABEL: test_urem_even: ; X86: # %bb.0: -; X86-NEXT: imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7 -; X86-NEXT: rorw %ax -; X86-NEXT: movzwl %ax, %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: imull $18725, %eax, %edx # imm = 0x4925 +; X86-NEXT: shrl $17, %edx +; X86-NEXT: leal (%edx,%edx), %eax +; X86-NEXT: shll $4, %edx +; X86-NEXT: subl %eax, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4682, %ecx # imm = 0x124A -; X86-NEXT: setae %al +; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: setne %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even: ; X64: # %bb.0: -; X64-NEXT: imull $28087, %edi, %eax # imm = 0x6DB7 -; X64-NEXT: rorw %ax -; X64-NEXT: movzwl %ax, %ecx +; X64-NEXT: movzwl %di, %ecx +; X64-NEXT: movl %ecx, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: imull $18725, %eax, %edx # imm = 0x4925 +; X64-NEXT: shrl $17, %edx +; X64-NEXT: leal (%rdx,%rdx), %eax +; X64-NEXT: shll $4, %edx +; X64-NEXT: subl %eax, %edx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4682, %ecx # imm = 0x124A -; X64-NEXT: setae %al +; X64-NEXT: cmpw %dx, %cx +; X64-NEXT: setne %al ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %urem = urem i16 %X, 14 @@ -131,20 +179,26 @@ define i32 @test_urem_even_100(i32 %X) nounwind { ; X86-LABEL: test_urem_even_100: ; X86: # %bb.0: -; X86-NEXT: imull $-1030792151, {{[0-9]+}}(%esp), %ecx # imm = 0xC28F5C29 -; X86-NEXT: rorl $2, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1374389535, %edx # imm = 0x51EB851F +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $5, %edx +; X86-NEXT: imull $100, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_100: ; X64: # %bb.0: -; X64-NEXT: imull $-1030792151, %edi, %ecx # imm = 0xC28F5C29 -; X64-NEXT: rorl $2, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $1374389535, %rax, %rax # imm = 0x51EB851F +; X64-NEXT: shrq $37, %rax +; X64-NEXT: imull $100, %eax, %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $42949673, %ecx # imm = 0x28F5C29 -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 100 %cmp = icmp eq i32 %urem, 0 @@ -156,20 +210,27 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit30: ; X86: # %bb.0: -; X86-NEXT: imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5 -; X86-NEXT: rorl $3, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-415, %edx # imm = 0xFE61 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $1073741928, %edx, %edx # imm = 0x40000068 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $4, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_bit30: ; X64: # %bb.0: -; X64-NEXT: imull $-51622203, %edi, %ecx # imm = 0xFCEC4EC5 -; X64-NEXT: rorl $3, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $4294966881, %ecx # imm = 0xFFFFFE61 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $62, %rcx +; X64-NEXT: imull $1073741928, %ecx, %ecx # imm = 0x40000068 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $4, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 1073741928 %cmp = icmp eq i32 %urem, 0 @@ -181,20 +242,26 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind { ; X86-LABEL: test_urem_even_bit31: ; X86: # %bb.0: -; X86-NEXT: imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $2147483547, %edx # imm = 0x7FFFFF9B +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $30, %edx +; X86-NEXT: imull $-2147483546, %edx, %edx # imm = 0x80000066 ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even_bit31: ; X64: # %bb.0: -; X64-NEXT: imull $-1157956869, %edi, %ecx # imm = 0xBAFAFAFB -; X64-NEXT: rorl %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq $2147483547, %rax, %rax # imm = 0x7FFFFF9B +; X64-NEXT: shrq $62, %rax +; X64-NEXT: imull $-2147483546, %eax, %ecx # imm = 0x80000066 ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setb %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 2147483750 %cmp = icmp eq i32 %urem, 0 @@ -210,18 +277,27 @@ define i32 @test_urem_odd_setne(i32 %X) nounwind { ; X86-LABEL: test_urem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $3435973837, %ecx # imm = 0xCCCCCCCD +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $34, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $858993460, %ecx # imm = 0x33333334 -; X64-NEXT: setae %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i32 %X, 5 %cmp = icmp ne i32 %urem, 0 @@ -233,18 +309,27 @@ define i32 @test_urem_negative_odd(i32 %X) nounwind { ; X86-LABEL: test_urem_negative_odd: ; X86: # %bb.0: -; X86-NEXT: imull $858993459, {{[0-9]+}}(%esp), %ecx # imm = 0x33333333 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2147483645, %edx # imm = 0x80000003 +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: shrl $31, %edx +; X86-NEXT: leal (%edx,%edx,4), %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_negative_odd: ; X64: # %bb.0: -; X64-NEXT: imull $858993459, %edi, %ecx # imm = 0x33333333 +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl $2147483651, %ecx # imm = 0x80000003 +; X64-NEXT: imulq %rax, %rcx +; X64-NEXT: shrq $63, %rcx +; X64-NEXT: leal (%rcx,%rcx,4), %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setae %al +; X64-NEXT: addl %edi, %ecx +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i32 %X, -5 %cmp = icmp ne i32 %urem, 0 @@ -254,20 +339,30 @@ define i32 @test_urem_negative_even(i32 %X) nounwind { ; X86-LABEL: test_urem_negative_even: ; X86: # %bb.0: -; X86-NEXT: imull $-920350135, {{[0-9]+}}(%esp), %ecx # imm = 0xC9249249 -; X86-NEXT: rorl %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: movl $268435457, %edx # imm = 0x10000001 +; X86-NEXT: mull %edx +; X86-NEXT: shrl $27, %edx +; X86-NEXT: imull $-14, %edx, %edx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setae %al +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: setne %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_negative_even: ; X64: # %bb.0: -; X64-NEXT: imull $-920350135, %edi, %ecx # imm = 0xC9249249 -; X64-NEXT: rorl %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shlq $28, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: shrq $59, %rcx +; X64-NEXT: imull $-14, %ecx, %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %ecx -; X64-NEXT: setae %al +; X64-NEXT: cmpl %ecx, %edi +; X64-NEXT: setne %al ; X64-NEXT: retq %urem = urem i32 %X, -14 %cmp = icmp ne i32 %urem, 0 @@ -337,19 +432,26 @@ define i32 @test_urem_allones(i32 %X) nounwind { ; X86-LABEL: test_urem_allones: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl $-1, %eax +; X86-NEXT: je .LBB14_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: .LBB14_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl $2, %ecx -; X86-NEXT: setb %al +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sete %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_allones: ; X64: # %bb.0: -; X64-NEXT: negl %edi +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: cmpl $-1, %edi +; X64-NEXT: cmovnel %edi, %ecx ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl $2, %edi -; X64-NEXT: setb %al +; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: sete %al ; X64-NEXT: retq %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 @@ -362,7 +464,16 @@ ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=34366 define void @ossfuzz34366() { ; X86-LABEL: ossfuzz34366: +; X86: # %bb.0: +; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: sete (%eax) +; X86-NEXT: retl +; ; X64-LABEL: ossfuzz34366: +; X64: # %bb.0: +; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: sete (%rax) +; X64-NEXT: retq %L10 = load i448, ptr undef, align 4 %B18 = urem i448 %L10, -363419362147803445274661903944002267176820680343659030140745099590319644056698961663095525356881782780381260803133088966767300814307328 %C13 = icmp ule i448 %B18, 0 diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -111,7 +111,7 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $7, %ecx ; X86-NEXT: addl %eax, %eax ; X86-NEXT: movl %eax, %edx diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -24,6 +24,7 @@ ; X64-NEXT: pcmpeqd %xmm5, %xmm0 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X64-NEXT: pand %xmm1, %xmm0 +; X64-NEXT: andpd %xmm0, %xmm4 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm4, %xmm0 @@ -148,6 +149,7 @@ ; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; X64-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[0,3] ; X64-NEXT: pcmpeqd %xmm5, %xmm0 +; X64-NEXT: pand %xmm0, %xmm6 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm6, %xmm0 @@ -269,6 +271,7 @@ ; X64-NEXT: pand %xmm1, %xmm3 ; X64-NEXT: por %xmm4, %xmm3 ; X64-NEXT: pcmpeqw %xmm3, %xmm0 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 @@ -444,6 +447,7 @@ ; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: por %xmm4, %xmm1 ; X64-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-NEXT: pand %xmm0, %xmm2 ; X64-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: por %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -1100,14 +1100,14 @@ ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; SSE-NEXT: psubusw %xmm4, %xmm1 ; SSE-NEXT: psubusw %xmm4, %xmm0 -; SSE-NEXT: psubusw %xmm4, %xmm3 +; SSE-NEXT: psubusw %xmm4, %xmm1 ; SSE-NEXT: psubusw %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rdi) +; SSE-NEXT: psubusw %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm2, 32(%rdi) ; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: PR48223: @@ -1117,14 +1117,14 @@ ; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64] -; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm2, 32(%rdi) +; AVX1-NEXT: vpsubusw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubusw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsubusw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqa %xmm3, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR48223: @@ -1132,10 +1132,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdi) +; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1144,10 +1144,17 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdi) +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472,65472] +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpandq %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -15,9 +15,16 @@ ; X86-NEXT: vmovups (%edx), %ymm0 ; X86-NEXT: vmovups (%ecx), %ymm1 ; X86-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vmovups (%eax), %ymm2 ; X86-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vmovaps %ymm0, (%eax) ; X86-NEXT: vzeroupper @@ -28,9 +35,16 @@ ; X64-NEXT: vmovups (%rdi), %ymm0 ; X64-NEXT: vmovups (%rsi), %ymm1 ; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vmovups (%rdx), %ymm2 ; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vmovaps %ymm0, (%rax) ; X64-NEXT: vzeroupper @@ -44,10 +58,16 @@ ; X86-AVX2-NEXT: vmovups (%edx), %ymm0 ; X86-AVX2-NEXT: vmovups (%ecx), %ymm1 ; X86-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vmovups (%eax), %ymm2 ; X86-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -57,10 +77,16 @@ ; X64-AVX2-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX2-NEXT: vmovups (%rsi), %ymm1 ; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vmovups (%rdx), %ymm2 ; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -180,21 +206,25 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { ; X86-LABEL: and_mask_constant: ; X86: ## %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: and_mask_constant: ; X64: ## %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq ; @@ -202,14 +232,26 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7] +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X86-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: and_mask_constant: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7] +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: and_mask_constant: @@ -242,34 +284,60 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: two_ands: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: two_ands: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: two_ands: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: two_ands: @@ -300,46 +368,80 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: three_ands: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: three_ands: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: three_ands: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: three_ands: @@ -376,56 +478,98 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_ands: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: four_ands: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: four_ands: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: four_ands: @@ -466,66 +610,126 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vpslld $31, %xmm1, %xmm1 +; X86-NEXT: vpsrad $31, %xmm1, %xmm1 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_ands: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vpslld $31, %xmm1, %xmm1 +; X64-NEXT: vpsrad $31, %xmm1, %xmm1 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: five_ands: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: five_ands: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: five_ands: @@ -570,34 +774,60 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: two_or: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: two_or: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: two_or: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: two_or: @@ -630,46 +860,80 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: three_or: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: three_or: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: three_or: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: three_or: @@ -710,56 +974,98 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: four_or: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: four_or: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: four_or: @@ -806,66 +1112,126 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vpslld $31, %xmm1, %xmm1 +; X86-NEXT: vpsrad $31, %xmm1, %xmm1 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_or: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vpslld $31, %xmm1, %xmm1 +; X64-NEXT: vpsrad $31, %xmm1, %xmm1 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: five_or: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: five_or: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: five_or: @@ -918,46 +1284,80 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: three_or_and: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: three_or_and: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: three_or_and: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: three_or_and: @@ -996,56 +1396,98 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or_and: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: four_or_and: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: four_or_and: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: four_or_and: @@ -1088,66 +1530,116 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_or_and: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: five_or_and: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: five_or_and: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: five_or_and: @@ -1196,56 +1688,98 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: four_or_and_xor: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: four_or_and_xor: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: four_or_and_xor: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: four_or_and_xor: @@ -1290,66 +1824,116 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vandps %ymm0, %ymm3, %ymm0 -; X86-NEXT: vxorps %ymm0, %ymm2, %ymm0 -; X86-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; X86-NEXT: vpand %xmm0, %xmm3, %xmm0 +; X86-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovsxwd %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: five_or_and_xor: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm3, %ymm0 -; X64-NEXT: vxorps %ymm0, %ymm2, %ymm0 -; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; X64-NEXT: vpand %xmm0, %xmm3, %xmm0 +; X64-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmovsxwd %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: five_or_and_xor: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 -; X86-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: five_or_and_xor: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 -; X64-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand %xmm0, %xmm3, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: five_or_and_xor: @@ -1399,76 +1983,144 @@ ; X86: ## %bb.0: ## %entry ; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 -; X86-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 -; X86-NEXT: vxorps %ymm1, %ymm3, %ymm1 -; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X86-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vpslld $31, %xmm1, %xmm1 +; X86-NEXT: vpsrad $31, %xmm1, %xmm1 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpslld $31, %xmm0, %xmm0 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: six_or_and_xor: ; X64: ## %bb.0: ## %entry ; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; X64-NEXT: vxorps %ymm1, %ymm3, %ymm1 -; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X64-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vpslld $31, %xmm1, %xmm1 +; X64-NEXT: vpsrad $31, %xmm1, %xmm1 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpslld $31, %xmm0, %xmm0 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-AVX2-LABEL: six_or_and_xor: ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X86-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 -; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X86-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: six_or_and_xor: ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 -; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 +; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512-LABEL: six_or_and_xor: diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -226,69 +226,77 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind { ; SSE3-LABEL: var_shuffle_v16i8: ; SSE3: # %bb.0: +; SSE3-NEXT: pushq %rbp +; SSE3-NEXT: pushq %r15 +; SSE3-NEXT: pushq %r14 +; SSE3-NEXT: pushq %r13 +; SSE3-NEXT: pushq %r12 +; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: andl $15, %ebp +; SSE3-NEXT: movzbl -24(%rsp,%rbp), %eax ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r13d +; SSE3-NEXT: movzbl -24(%rsp,%r13), %eax ; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r12d +; SSE3-NEXT: movzbl -24(%rsp,%r12), %eax ; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r15d +; SSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE3-NEXT: movd %eax, %xmm5 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r14d +; SSE3-NEXT: movzbl -24(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ebx +; SSE3-NEXT: movzbl -24(%rsp,%rbx), %eax ; SSE3-NEXT: movd %eax, %xmm8 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r11d +; SSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm6 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r10d +; SSE3-NEXT: movzbl -24(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r9d +; SSE3-NEXT: movzbl -24(%rsp,%r9), %eax ; SSE3-NEXT: movd %eax, %xmm10 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r8d +; SSE3-NEXT: movzbl -24(%rsp,%r8), %eax ; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edi +; SSE3-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %esi +; SSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edx +; SSE3-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ecx +; SSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE3-NEXT: movd %eax, %xmm15 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 @@ -307,6 +315,12 @@ ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %r12 +; SSE3-NEXT: popq %r13 +; SSE3-NEXT: popq %r14 +; SSE3-NEXT: popq %r15 +; SSE3-NEXT: popq %rbp ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8: @@ -490,69 +504,77 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind { ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: ; SSE3: # %bb.0: +; SSE3-NEXT: pushq %rbp +; SSE3-NEXT: pushq %r15 +; SSE3-NEXT: pushq %r14 +; SSE3-NEXT: pushq %r13 +; SSE3-NEXT: pushq %r12 +; SSE3-NEXT: pushq %rbx ; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax -; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d +; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp ; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: andl $15, %ebp +; SSE3-NEXT: movzbl -24(%rsp,%rbp), %eax ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r13d +; SSE3-NEXT: movzbl -24(%rsp,%r13), %eax ; SSE3-NEXT: movd %eax, %xmm4 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r12d +; SSE3-NEXT: movzbl -24(%rsp,%r12), %eax ; SSE3-NEXT: movd %eax, %xmm3 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r15d +; SSE3-NEXT: movzbl -24(%rsp,%r15), %eax ; SSE3-NEXT: movd %eax, %xmm5 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r14d +; SSE3-NEXT: movzbl -24(%rsp,%r14), %eax ; SSE3-NEXT: movd %eax, %xmm7 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ebx +; SSE3-NEXT: movzbl -24(%rsp,%rbx), %eax ; SSE3-NEXT: movd %eax, %xmm8 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r11d +; SSE3-NEXT: movzbl -24(%rsp,%r11), %eax ; SSE3-NEXT: movd %eax, %xmm6 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r10d +; SSE3-NEXT: movzbl -24(%rsp,%r10), %eax ; SSE3-NEXT: movd %eax, %xmm9 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r9d +; SSE3-NEXT: movzbl -24(%rsp,%r9), %eax ; SSE3-NEXT: movd %eax, %xmm10 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %r8d +; SSE3-NEXT: movzbl -24(%rsp,%r8), %eax ; SSE3-NEXT: movd %eax, %xmm11 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edi +; SSE3-NEXT: movzbl -24(%rsp,%rdi), %eax ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %esi +; SSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE3-NEXT: movd %eax, %xmm13 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %edx +; SSE3-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSE3-NEXT: movd %eax, %xmm14 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE3-NEXT: andl $15, %eax -; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE3-NEXT: andl $15, %ecx +; SSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE3-NEXT: movd %eax, %xmm15 -; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE3-NEXT: andl $15, %eax ; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE3-NEXT: movd %eax, %xmm0 @@ -571,6 +593,12 @@ ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %r12 +; SSE3-NEXT: popq %r13 +; SSE3-NEXT: popq %r14 +; SSE3-NEXT: popq %r15 +; SSE3-NEXT: popq %rbp ; SSE3-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8: @@ -1156,24 +1184,30 @@ ; ; XOP-LABEL: indices_convert: ; XOP: # %bb.0: # %bb -; XOP-NEXT: vmovdqa (%rax), %xmm0 -; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOP-NEXT: vpermil2pd $0, %xmm0, (%rax), %xmm1, %xmm0 ; XOP-NEXT: vmovupd %xmm0, (%rax) +; XOP-NEXT: vzeroupper ; XOP-NEXT: retq ; ; AVX1-LABEL: indices_convert: ; AVX1: # %bb.0: # %bb -; AVX1-NEXT: vmovdqa (%rax), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovapd (%rax), %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpermilpd %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX1-NEXT: vpermilpd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vmovupd %xmm0, (%rax) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: indices_convert: diff --git a/llvm/test/CodeGen/X86/var-permute-512.ll b/llvm/test/CodeGen/X86/var-permute-512.ll --- a/llvm/test/CodeGen/X86/var-permute-512.ll +++ b/llvm/test/CodeGen/X86/var-permute-512.ll @@ -170,12 +170,10 @@ ; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrw $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax ; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax @@ -422,12 +420,10 @@ ; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $14, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $15, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -471,12 +467,10 @@ ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -521,12 +515,10 @@ ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax @@ -647,12 +639,10 @@ ; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $15, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -696,12 +686,10 @@ ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -746,12 +734,10 @@ ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 ; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax @@ -1093,21 +1079,21 @@ ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: andl $63, %esi ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1134,50 +1120,56 @@ ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 ; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm3 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: vmovd %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: vpextrd $1, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm4, %eax +; AVX512F-NEXT: vpextrd $2, %xmm3, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vpextrd $3, %xmm3, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm1 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm3 ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512F-NEXT: vmovd %xmm3, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $1, %xmm1, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3 +; AVX512F-NEXT: vmovd %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm6, %eax @@ -1188,114 +1180,100 @@ ; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $2, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vpextrd $3, %xmm6, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm6 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 -; AVX512F-NEXT: vmovd %xmm5, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512F-NEXT: vpextrd $1, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 -; AVX512F-NEXT: vpextrd $2, %xmm5, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $2, %xmm6, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $3, %xmm6, %eax +; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm5, %xmm5 +; AVX512F-NEXT: vpextrd $1, %xmm4, %eax +; AVX512F-NEXT: andl $63, %eax +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm5, %xmm5 ; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 +; AVX512F-NEXT: vmovd %eax, %xmm6 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm8, %eax +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm8, %eax -; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm8 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm7, %eax +; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm7 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vmovd %xmm8, %eax +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $1, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $1, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $2, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $2, %xmm7, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512F-NEXT: vpextrd $3, %xmm8, %eax +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512F-NEXT: vpextrd $3, %xmm7, %eax ; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vmovd %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $1, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $2, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512F-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 ; AVX512F-NEXT: vpextrd $3, %xmm2, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 -; AVX512F-NEXT: vpextrd $3, %xmm5, %eax -; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm3 -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $1, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $2, %xmm3, %eax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm6, %xmm2 +; AVX512F-NEXT: vpextrd $2, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512F-NEXT: vpextrd $3, %xmm3, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512F-NEXT: vpextrd $1, %xmm1, %eax -; AVX512F-NEXT: andl $63, %eax -; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512F-NEXT: vpextrd $2, %xmm1, %eax +; AVX512F-NEXT: vpextrd $3, %xmm4, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm5, %xmm4 ; AVX512F-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-NEXT: andl $63, %eax ; AVX512F-NEXT: movzbl (%rsp,%rax), %eax -; AVX512F-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512F-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vpmovsxbd %xmm4, %zmm3 ; AVX512F-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vcvtdq2ps %zmm1, %zmm1 @@ -1344,21 +1322,21 @@ ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm4 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512BW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: andl $63, %esi ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0 @@ -1385,50 +1363,56 @@ ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm1, %xmm1 ; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4 +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm3 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: vmovd %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm1, %xmm6 -; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm1 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm4 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm3 ; AVX512BW-NEXT: vmovd %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm4, %xmm4 -; AVX512BW-NEXT: vmovd %xmm3, %eax +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $14, (%rsp,%rax), %xmm3, %xmm3 +; AVX512BW-NEXT: vmovd %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vmovd %eax, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm6, %eax @@ -1439,114 +1423,100 @@ ; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm7 +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm5 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm6 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm6 -; AVX512BW-NEXT: vmovd %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vmovd %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vpextrd $1, %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 -; AVX512BW-NEXT: vpextrd $2, %xmm5, %eax +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $2, %xmm6, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $3, %xmm6, %eax +; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm4 +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vmovd %xmm4, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm5, %xmm5 +; AVX512BW-NEXT: vpextrd $1, %xmm4, %eax +; AVX512BW-NEXT: andl $63, %eax +; AVX512BW-NEXT: vpinsrb $13, (%rsp,%rax), %xmm5, %xmm5 ; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vmovd %eax, %xmm7 +; AVX512BW-NEXT: vmovd %eax, %xmm6 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $1, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $2, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $3, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vmovd %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $4, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $5, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax -; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm8 +; AVX512BW-NEXT: vpinsrb $6, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm7, %eax +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm7 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vmovd %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $7, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vmovd %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $1, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $8, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $1, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $2, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $9, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $2, %xmm7, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7 -; AVX512BW-NEXT: vpextrd $3, %xmm8, %eax +; AVX512BW-NEXT: vpinsrb $10, (%rsp,%rax), %xmm6, %xmm6 +; AVX512BW-NEXT: vpextrd $3, %xmm7, %eax ; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $11, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vmovd %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $12, (%rsp,%rax), %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $1, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $2, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm7, %xmm7 +; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6 ; AVX512BW-NEXT: vpextrd $3, %xmm2, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm7, %xmm2 -; AVX512BW-NEXT: vpextrd $3, %xmm5, %eax -; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm3 -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm6, %xmm5 -; AVX512BW-NEXT: vmovd %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $1, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $2, %xmm3, %eax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm6, %xmm2 +; AVX512BW-NEXT: vpextrd $2, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax ; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5 -; AVX512BW-NEXT: vpextrd $3, %xmm3, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm3 -; AVX512BW-NEXT: vpextrd $1, %xmm1, %eax -; AVX512BW-NEXT: andl $63, %eax -; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 -; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax +; AVX512BW-NEXT: vpextrd $3, %xmm4, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm5, %xmm4 ; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-NEXT: andl $63, %eax ; AVX512BW-NEXT: movzbl (%rsp,%rax), %eax -; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm4, %xmm1 +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm1 ; AVX512BW-NEXT: vpmovsxbd %xmm2, %zmm2 ; AVX512BW-NEXT: vcvtdq2ps %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512BW-NEXT: vpmovsxbd %xmm4, %zmm3 ; AVX512BW-NEXT: vcvtdq2ps %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512BW-NEXT: vcvtdq2ps %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll --- a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll +++ b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll @@ -9,11 +9,11 @@ ; CHECK-LABEL: check_flag: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: subq $56, %rsp -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_4 ; CHECK-NEXT: ## %bb.3: ## %entry diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -808,8 +808,7 @@ ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-64-NEXT: vextractps $2, %xmm0, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: movl %eax, %eax +; AVX1-64-NEXT: vmovd %xmm0, %eax ; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -851,8 +850,7 @@ ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-64-NEXT: vextractps $2, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: movl %eax, %eax +; AVX2-64-NEXT: vmovd %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1084,28 +1082,31 @@ ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-64-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-64-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-64-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-64-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-64-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-64-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-64-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-64-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_ext_inreg.ll b/llvm/test/CodeGen/X86/vec_ext_inreg.ll --- a/llvm/test/CodeGen/X86/vec_ext_inreg.ll +++ b/llvm/test/CodeGen/X86/vec_ext_inreg.ll @@ -6,19 +6,20 @@ define <8 x i32> @a(<8 x i32> %a) nounwind { ; SSE-LABEL: a: ; SSE: # %bb.0: +; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: a: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -62,9 +63,9 @@ define <8 x i32> @d(<8 x i32> %a) nounwind { ; SSE-LABEL: d: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: d: diff --git a/llvm/test/CodeGen/X86/vec_fneg.ll b/llvm/test/CodeGen/X86/vec_fneg.ll --- a/llvm/test/CodeGen/X86/vec_fneg.ll +++ b/llvm/test/CodeGen/X86/vec_fneg.ll @@ -118,30 +118,15 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind { ; X32-SSE1-LABEL: fneg_bitcast: ; X32-SSE1: # %bb.0: -; X32-SSE1-NEXT: pushl %ebp -; X32-SSE1-NEXT: movl %esp, %ebp -; X32-SSE1-NEXT: andl $-16, %esp -; X32-SSE1-NEXT: subl $16, %esp -; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000 -; X32-SSE1-NEXT: movl 12(%ebp), %ecx -; X32-SSE1-NEXT: xorl %eax, %ecx -; X32-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X32-SSE1-NEXT: xorl 8(%ebp), %eax -; X32-SSE1-NEXT: movl %eax, (%esp) -; X32-SSE1-NEXT: movaps (%esp), %xmm0 -; X32-SSE1-NEXT: movl %ebp, %esp -; X32-SSE1-NEXT: popl %ebp +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X32-SSE1-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X32-SSE1-NEXT: retl ; ; X32-SSE2-LABEL: fneg_bitcast: ; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000 -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE2-NEXT: xorl %eax, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X32-SSE2-NEXT: retl ; ; X64-SSE1-LABEL: fneg_bitcast: diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -186,14 +186,14 @@ ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08] -; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09] -; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18] -; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10] -; X32-SSE-NEXT: movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20] -; X32-SSE-NEXT: movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30] -; X32-SSE-NEXT: movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08] -; X32-SSE-NEXT: movups %xmm0, 16(%eax) # encoding: [0x0f,0x11,0x40,0x10] +; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01] +; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08] +; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x10] +; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x18] +; X32-SSE-NEXT: movups %xmm3, 48(%eax) # encoding: [0x0f,0x11,0x58,0x30] +; X32-SSE-NEXT: movups %xmm2, 32(%eax) # encoding: [0x0f,0x11,0x50,0x20] +; X32-SSE-NEXT: movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10] +; X32-SSE-NEXT: movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00] ; X32-SSE-NEXT: retl # encoding: [0xc3] ; ; X32-AVX-LABEL: fpext_frommem8: @@ -218,14 +218,14 @@ ; ; X64-SSE-LABEL: fpext_frommem8: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm0 # encoding: [0x0f,0x5a,0x47,0x08] -; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm1 # encoding: [0x0f,0x5a,0x0f] -; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x18] -; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x10] -; X64-SSE-NEXT: movups %xmm3, 32(%rsi) # encoding: [0x0f,0x11,0x5e,0x20] -; X64-SSE-NEXT: movups %xmm2, 48(%rsi) # encoding: [0x0f,0x11,0x56,0x30] -; X64-SSE-NEXT: movups %xmm1, (%rsi) # encoding: [0x0f,0x11,0x0e] -; X64-SSE-NEXT: movups %xmm0, 16(%rsi) # encoding: [0x0f,0x11,0x46,0x10] +; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07] +; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08] +; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10] +; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x18] +; X64-SSE-NEXT: movups %xmm3, 48(%rsi) # encoding: [0x0f,0x11,0x5e,0x30] +; X64-SSE-NEXT: movups %xmm2, 32(%rsi) # encoding: [0x0f,0x11,0x56,0x20] +; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10] +; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX-LABEL: fpext_frommem8: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2570,28 +2570,31 @@ ; ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -3658,8 +3661,8 @@ ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -4372,23 +4375,24 @@ ; AVX1-NEXT: vmovapd (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 +; AVX1-NEXT: vorpd %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vblendvpd %ymm3, %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm6, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 @@ -4824,7 +4828,8 @@ ; AVX1-NEXT: vpsrlq $1, %xmm8, %xmm9 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 ; AVX1-NEXT: vorpd %ymm5, %ymm7, %ymm5 -; AVX1-NEXT: vblendvpd %ymm3, %ymm5, %ymm3, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm7 +; AVX1-NEXT: vblendvpd %ymm7, %ymm5, %ymm3, %ymm3 ; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm5 ; AVX1-NEXT: vmovq %xmm3, %rax @@ -4845,7 +4850,8 @@ ; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-NEXT: vorpd %ymm4, %ymm5, %ymm4 -; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm10, %xmm4 ; AVX1-NEXT: vmovq %xmm2, %rax diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -505,7 +505,7 @@ ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll --- a/llvm/test/CodeGen/X86/vec_setcc-2.ll +++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll @@ -231,12 +231,14 @@ ; SSE2-LABEL: ugt_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -303,11 +305,12 @@ ; SSE2-LABEL: uge_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 @@ -377,11 +380,12 @@ ; SSE2-LABEL: ult_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; @@ -448,15 +452,17 @@ ; SSE2-LABEL: ule_v2i64_splat: ; SSE2: ## %bb.0: ; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259898,9223372039002259898] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: ule_v2i64_splat: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -422,73 +422,73 @@ ; SSE2-LABEL: smulo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE2-NEXT: movd %r9d, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm6, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm10, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 -; SSE2-NEXT: pand %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm9 -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-NEXT: pand %xmm6, %xmm12 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm13 +; SSE2-NEXT: pand %xmm8, %xmm13 +; SSE2-NEXT: paddd %xmm12, %xmm13 +; SSE2-NEXT: pmuludq %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm7[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pmuludq %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE2-NEXT: psubd %xmm13, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE2-NEXT: psubd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm11 +; SSE2-NEXT: pand %xmm10, %xmm11 +; SSE2-NEXT: paddd %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pmuludq %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: psubd %xmm11, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: movq %xmm0, 16(%rcx) ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: movq %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm1, (%rdi) ; SSE2-NEXT: retq @@ -496,73 +496,73 @@ ; SSSE3-LABEL: smulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: movd %r8d, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSSE3-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSSE3-NEXT: movd %r9d, %xmm0 -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm6, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSSE3-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm10, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 -; SSSE3-NEXT: pand %xmm1, %xmm8 -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm9 -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: paddd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm8, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; SSSE3-NEXT: psubd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSSE3-NEXT: pxor %xmm11, %xmm11 +; SSSE3-NEXT: pxor %xmm12, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm12 +; SSSE3-NEXT: pand %xmm6, %xmm12 +; SSSE3-NEXT: pxor %xmm13, %xmm13 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm13 +; SSSE3-NEXT: pand %xmm8, %xmm13 +; SSSE3-NEXT: paddd %xmm12, %xmm13 +; SSSE3-NEXT: pmuludq %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,3,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm7[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pmuludq %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSSE3-NEXT: psubd %xmm13, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movdqa %xmm1, (%rcx) ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 -; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: paddd %xmm8, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSSE3-NEXT: pmuludq %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSSE3-NEXT: psubd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm6 +; SSSE3-NEXT: pand %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm11 +; SSSE3-NEXT: pand %xmm10, %xmm11 +; SSSE3-NEXT: paddd %xmm6, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSSE3-NEXT: pmuludq %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSSE3-NEXT: psubd %xmm11, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: movq %xmm0, 16(%rcx) ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: movq %xmm0, 16(%rdi) ; SSSE3-NEXT: movdqa %xmm1, (%rdi) ; SSSE3-NEXT: retq @@ -1218,7 +1218,7 @@ ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 @@ -2003,14 +2003,14 @@ ; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm5, 240(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: movdqa %xmm4, 224(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2019,14 +2019,14 @@ ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 208(%rdi) ; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 160(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, 176(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 176(%rdi) +; SSE2-NEXT: movdqa %xmm3, 160(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2035,14 +2035,14 @@ ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 96(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 112(%rdi) +; SSE2-NEXT: movdqa %xmm2, 96(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2051,14 +2051,14 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 @@ -2182,14 +2182,14 @@ ; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2198,14 +2198,14 @@ ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) ; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2214,14 +2214,14 @@ ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 112(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -2230,14 +2230,14 @@ ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 @@ -2350,61 +2350,61 @@ ; SSE41-NEXT: movdqa %xmm4, 64(%rdi) ; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 ; SSE41-NEXT: movdqa %xmm4, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm4, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,3,3,3] +; SSE41-NEXT: movdqa %xmm4, 240(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 ; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm4, 240(%rdi) +; SSE41-NEXT: movdqa %xmm4, 224(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 ; SSE41-NEXT: movdqa %xmm3, 208(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 160(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE41-NEXT: movdqa %xmm3, 176(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, 176(%rdi) +; SSE41-NEXT: movdqa %xmm3, 160(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 ; SSE41-NEXT: movdqa %xmm2, 144(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 96(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; SSE41-NEXT: movdqa %xmm2, 112(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 112(%rdi) +; SSE41-NEXT: movdqa %xmm2, 96(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: movdqa %xmm1, 80(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, 32(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE41-NEXT: movdqa %xmm1, 48(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm1, 48(%rdi) +; SSE41-NEXT: movdqa %xmm1, 32(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 @@ -2494,39 +2494,39 @@ ; AVX1-NEXT: vmovdqa %xmm1, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 240(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 176(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 112(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) @@ -3262,9 +3262,9 @@ ; AVX-LABEL: smulo_v4i1: ; AVX: # %bb.0: ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpslld $31, %xmm0, %xmm1 -; AVX-NEXT: vpsrad $31, %xmm1, %xmm0 -; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX-NEXT: vmovmskps %xmm0, %eax +; AVX-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX-NEXT: movb %al, (%rdi) ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -510,7 +510,7 @@ ; AVX2-NEXT: vpxor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -597,7 +597,7 @@ ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -138,10 +138,12 @@ ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: movd %xmm2, 8(%rdi) -; SSE2-NEXT: movq %xmm0, (%rdi) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE2-NEXT: movd %xmm0, 8(%rdi) +; SSE2-NEXT: movq %xmm2, (%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -158,10 +160,12 @@ ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: movd %xmm2, 8(%rdi) -; SSSE3-NEXT: movq %xmm0, (%rdi) +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSSE3-NEXT: movd %xmm0, 8(%rdi) +; SSSE3-NEXT: movq %xmm2, (%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -362,101 +366,101 @@ ; SSE2-LABEL: umulo_v6i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: movd %r8d, %xmm0 -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movd %r8d, %xmm4 +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: movd %esi, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE2-NEXT: pmuludq %xmm1, %xmm6 +; SSE2-NEXT: movd %r9d, %xmm0 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: pmuludq %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = mem[0,0,0,0] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] -; SSE2-NEXT: pmuludq %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm0, 16(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: pmuludq %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: movq %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: movq %xmm7, 16(%rdi) -; SSE2-NEXT: movdqa %xmm1, (%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v6i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: movd %r8d, %xmm0 -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %edx, %xmm0 -; SSSE3-NEXT: movd %esi, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-NEXT: movd %r9d, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movd %r8d, %xmm4 +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: movd %esi, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSSE3-NEXT: pmuludq %xmm1, %xmm6 +; SSSE3-NEXT: movd %r9d, %xmm0 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pmuludq %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pmuludq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: pmuludq %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,3,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSSE3-NEXT: pmuludq %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 +; SSSE3-NEXT: pxor %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = mem[0,0,0,0] ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] -; SSSE3-NEXT: pmuludq %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSSE3-NEXT: movq %xmm0, 16(%rcx) -; SSSE3-NEXT: movdqa %xmm3, (%rcx) +; SSSE3-NEXT: pmuludq %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,3,2,3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm3, %xmm7 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-NEXT: movq %xmm1, 16(%rcx) +; SSSE3-NEXT: movdqa %xmm4, (%rcx) ; SSSE3-NEXT: movq %xmm7, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm1, (%rdi) +; SSSE3-NEXT: movdqa %xmm0, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v6i32: @@ -1013,7 +1017,7 @@ ; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm5 ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 @@ -1723,14 +1727,14 @@ ; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm4, (%rdi) ; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, 224(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm5, 240(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: movdqa %xmm4, 224(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1739,14 +1743,14 @@ ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 208(%rdi) ; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 160(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, 176(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 176(%rdi) +; SSE2-NEXT: movdqa %xmm3, 160(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1755,14 +1759,14 @@ ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 96(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 112(%rdi) +; SSE2-NEXT: movdqa %xmm2, 96(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1771,14 +1775,14 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) +; SSE2-NEXT: movdqa %xmm1, 32(%rdi) ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 @@ -1886,14 +1890,14 @@ ; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, (%rdi) ; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, 224(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1902,14 +1906,14 @@ ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) ; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) +; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1918,14 +1922,14 @@ ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 112(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -1934,14 +1938,14 @@ ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm1, 32(%rdi) ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 @@ -2030,61 +2034,61 @@ ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) ; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 240(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 240(%rdi) +; SSE41-NEXT: movdqa %xmm0, 224(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 208(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 160(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 176(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 176(%rdi) +; SSE41-NEXT: movdqa %xmm0, 160(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 144(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 96(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 112(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 112(%rdi) +; SSE41-NEXT: movdqa %xmm0, 96(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 80(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 32(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: movdqa %xmm0, 48(%rdi) +; SSE41-NEXT: movdqa %xmm0, 32(%rdi) ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 @@ -2170,39 +2174,39 @@ ; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -640,7 +640,7 @@ ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -987,7 +987,9 @@ ; ; GFNIAVX1-LABEL: test_bitreverse_v32i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v32i8: @@ -1176,7 +1178,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v16i16: @@ -1376,7 +1380,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v8i32: @@ -1580,7 +1586,9 @@ ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm1 = mem[0,1,0,1] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v4i64: @@ -1855,7 +1863,8 @@ ; ; GFNIAVX1-LABEL: test_bitreverse_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq @@ -2176,7 +2185,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2534,7 +2544,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2900,7 +2911,8 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -519,9 +519,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> zeroinitializer, <16 x float> %y %r = fsub <16 x float> %x, %s @@ -2682,16 +2682,16 @@ ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: mul_v4i32: @@ -2741,26 +2741,26 @@ ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE2-NEXT: pandn %xmm6, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2822,32 +2822,32 @@ ; SSE2-LABEL: mul_v8i32_cast_cond: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 -; SSE2-NEXT: psubd %xmm5, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] +; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pmuludq %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -45,36 +45,16 @@ ; SSE-NEXT: negq %rax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4f64_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vtestpd %ymm1, %ymm0 -; AVX1-NEXT: sbbq %rax, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4f64_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestpd %ymm1, %ymm0 -; AVX2-NEXT: sbbq %rax, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4f64_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestpd %ymm1, %ymm0 -; AVX512-NEXT: sbbq %rax, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test_v4f64_sext: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 %s = sext <4 x i1> %c to <4 x i64> %1 = shufflevector <4 x i64> %s, <4 x i64> undef, <4 x i32> @@ -91,11 +71,12 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: movmskps %xmm2, %ecx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: cmpl $15, %ecx -; SSE-NEXT: sete %al -; SSE-NEXT: negq %rax +; SSE-NEXT: packssdw %xmm3, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: cltq ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: test_v4f64_legal_sext: @@ -103,10 +84,12 @@ ; AVX1OR2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: xorl %eax, %eax -; AVX1OR2-NEXT: vtestps %xmm1, %xmm0 -; AVX1OR2-NEXT: sbbq %rax, %rax +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovd %xmm0, %eax +; AVX1OR2-NEXT: cltq ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; @@ -173,36 +156,18 @@ ; SSE-NEXT: negl %eax ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v8f32_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %ymm1, %ymm0 -; AVX1-NEXT: sbbl %eax, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8f32_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestps %ymm1, %ymm0 -; AVX2-NEXT: sbbl %eax, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8f32_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestps %ymm1, %ymm0 -; AVX512-NEXT: sbbl %eax, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: test_v8f32_sext: +; AVX: # %bb.0: +; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 %s = sext <8 x i1> %c to <8 x i32> %1 = shufflevector <8 x i32> %s, <8 x i32> undef, <8 x i32> @@ -357,30 +322,31 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vtestpd %xmm1, %xmm0 -; AVX1-NEXT: sbbq %rax, %rax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestpd %ymm1, %ymm0 -; AVX2-NEXT: sbbq %rax, %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestpd %ymm1, %ymm0 -; AVX512-NEXT: sbbq %rax, %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -400,25 +366,30 @@ ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: cmpl $15, %ecx -; SSE2-NEXT: sete %al -; SSE2-NEXT: negq %rax +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: cltq ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_v4i64_legal_sext: @@ -426,11 +397,12 @@ ; SSE42-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: packssdw %xmm1, %xmm0 -; SSE42-NEXT: movmskps %xmm0, %ecx -; SSE42-NEXT: xorl %eax, %eax -; SSE42-NEXT: cmpl $15, %ecx -; SSE42-NEXT: sete %al -; SSE42-NEXT: negq %rax +; SSE42-NEXT: packssdw %xmm1, %xmm1 +; SSE42-NEXT: pand %xmm0, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: pand %xmm1, %xmm0 +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: cltq ; SSE42-NEXT: retq ; ; AVX1-LABEL: test_v4i64_legal_sext: @@ -440,10 +412,12 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %xmm1, %xmm0 -; AVX1-NEXT: sbbq %rax, %rax +; AVX1-NEXT: vpackssdw %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cltq ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -452,10 +426,12 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestps %xmm1, %xmm0 -; AVX2-NEXT: sbbq %rax, %rax +; AVX2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -529,30 +505,37 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vtestps %xmm1, %xmm0 -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vtestps %ymm1, %ymm0 -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vtestps %ymm1, %ymm0 -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -687,11 +670,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF -; AVX1-NEXT: sete %al -; AVX1-NEXT: negl %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -699,11 +684,15 @@ ; AVX2-LABEL: test_v16i16_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: cmpl $-1, %ecx -; AVX2-NEXT: sete %al -; AVX2-NEXT: negl %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -711,11 +700,15 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: cmpl $-1, %ecx -; AVX512-NEXT: sete %al -; AVX512-NEXT: negl %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -853,30 +846,52 @@ ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %eax -; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF -; AVX1-NEXT: sete %al -; AVX1-NEXT: negb %al +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i8_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %eax -; AVX2-NEXT: cmpl $-1, %eax -; AVX2-NEXT: sete %al -; AVX2-NEXT: negb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %eax -; AVX512-NEXT: cmpl $-1, %eax -; AVX512-NEXT: sete %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <32 x i8> %a0, %a1 @@ -1547,27 +1562,32 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: sete %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: ; SSE42: # %bb.0: ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pxor %xmm0, %xmm1 -; SSE42-NEXT: ptest %xmm1, %xmm1 -; SSE42-NEXT: sete %al +; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 +; SSE42-NEXT: movmskpd %xmm1, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: andb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vptest %xmm0, %xmm0 -; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -46,10 +46,11 @@ ; AVX-LABEL: test_v4f64_sext: ; AVX: # %bb.0: ; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: vtestpd %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: negq %rax +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %c = fcmp ogt <4 x double> %a0, %a1 @@ -68,10 +69,12 @@ ; SSE-NEXT: cmpltpd %xmm1, %xmm3 ; SSE-NEXT: cmpltpd %xmm0, %xmm2 ; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: movmskps %xmm2, %ecx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: negl %ecx -; SSE-NEXT: sbbq %rax, %rax +; SSE-NEXT: packssdw %xmm3, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: cltq ; SSE-NEXT: retq ; ; AVX1OR2-LABEL: test_v4f64_legal_sext: @@ -79,10 +82,12 @@ ; AVX1OR2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: xorl %eax, %eax -; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: negq %rax +; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovd %xmm0, %eax +; AVX1OR2-NEXT: cltq ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; @@ -151,10 +156,13 @@ ; AVX-LABEL: test_v8f32_sext: ; AVX: # %bb.0: ; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: vtestps %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: negl %eax +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %c = fcmp ogt <8 x float> %a0, %a1 @@ -304,30 +312,31 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestpd %xmm0, %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: negq %rax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestpd %ymm0, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: negq %rax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vtestpd %ymm0, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: negq %rax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <4 x i64> %a0, %a1 @@ -347,24 +356,30 @@ ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: sbbq %rax, %rax +; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: cltq ; SSE2-NEXT: retq ; ; SSE42-LABEL: test_v4i64_legal_sext: @@ -372,10 +387,12 @@ ; SSE42-NEXT: pcmpgtq %xmm3, %xmm1 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; SSE42-NEXT: packssdw %xmm1, %xmm0 -; SSE42-NEXT: movmskps %xmm0, %ecx -; SSE42-NEXT: xorl %eax, %eax -; SSE42-NEXT: negl %ecx -; SSE42-NEXT: sbbq %rax, %rax +; SSE42-NEXT: packssdw %xmm1, %xmm1 +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: cltq ; SSE42-NEXT: retq ; ; AVX1-LABEL: test_v4i64_legal_sext: @@ -385,10 +402,12 @@ ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %xmm0, %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: negq %rax +; AVX1-NEXT: vpackssdw %xmm2, %xmm2, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cltq ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -397,10 +416,12 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestps %xmm0, %xmm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: negq %rax +; AVX2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cltq ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -473,30 +494,37 @@ ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vtestps %xmm0, %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: negl %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vtestps %ymm0, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: negl %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vtestps %ymm0, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: negl %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %c = icmp sgt <8 x i32> %a0, %a1 @@ -624,10 +652,13 @@ ; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: negl %ecx -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -635,10 +666,15 @@ ; AVX2-LABEL: test_v16i16_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -646,10 +682,15 @@ ; AVX512-LABEL: test_v16i16_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: negl %ecx -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -786,10 +827,15 @@ ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovmskb %xmm0, %ecx -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: negl %ecx -; AVX1-NEXT: sbbl %eax, %eax +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -797,10 +843,17 @@ ; AVX2-LABEL: test_v32i8_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmovmskb %ymm0, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: negl %ecx -; AVX2-NEXT: sbbl %eax, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -808,10 +861,17 @@ ; AVX512-LABEL: test_v32i8_sext: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovmskb %ymm0, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: negl %ecx -; AVX512-NEXT: sbbl %eax, %eax +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1430,9 +1490,10 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al +; SSE2-NEXT: movmskpd %xmm0, %ecx +; SSE2-NEXT: movl %ecx, %eax +; SSE2-NEXT: shrb %al +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; ; SSE42-LABEL: select_v2i8: @@ -1440,9 +1501,10 @@ ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pcmpeqq %xmm0, %xmm1 -; SSE42-NEXT: movmskpd %xmm1, %eax -; SSE42-NEXT: testl %eax, %eax -; SSE42-NEXT: setne %al +; SSE42-NEXT: movmskpd %xmm1, %ecx +; SSE42-NEXT: movl %ecx, %eax +; SSE42-NEXT: shrb %al +; SSE42-NEXT: orb %cl, %al ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: select_v2i8: @@ -1450,8 +1512,10 @@ ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 -; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vmovmskpd %xmm0, %ecx +; AVX1OR2-NEXT: movl %ecx, %eax +; AVX1OR2-NEXT: shrb %al +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: select_v2i8: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -7539,8 +7539,7 @@ ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vextractps $2, %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movl %eax, %eax +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 @@ -7656,28 +7655,31 @@ ; ; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-ext-logic.ll b/llvm/test/CodeGen/X86/vector-ext-logic.ll --- a/llvm/test/CodeGen/X86/vector-ext-logic.ll +++ b/llvm/test/CodeGen/X86/vector-ext-logic.ll @@ -260,25 +260,19 @@ define <8 x i32> @bool_zext_and(<8 x i1> %x, <8 x i1> %y) { ; SSE2-LABEL: bool_zext_and: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq ; ; AVX2-LABEL: bool_zext_and: ; AVX2: # %bb.0: +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %xz = zext <8 x i1> %x to <8 x i32> %yz = zext <8 x i1> %y to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1269,31 +1269,31 @@ ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpmovsxbd %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] +; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm5 ; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm7, %xmm8 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm9 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm9 ; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm9[1,3],xmm8[1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] ; AVX1-NEXT: vpsllq %xmm1, %xmm9, %xmm10 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX1-NEXT: vpsllq %xmm1, %xmm6, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; AVX1-NEXT: vpsllq %xmm1, %xmm5, %xmm11 ; AVX1-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,3],xmm10[1,3] ; AVX1-NEXT: vpsllq %xmm2, %xmm7, %xmm7 -; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] -; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm5 ; AVX1-NEXT: vpsllq %xmm2, %xmm6, %xmm6 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,3],xmm5[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3] +; AVX1-NEXT: vblendvps %xmm4, %xmm8, %xmm6, %xmm4 +; AVX1-NEXT: vpsllq %xmm2, %xmm9, %xmm6 +; AVX1-NEXT: vpsllq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] ; AVX1-NEXT: vblendvps %xmm3, %xmm10, %xmm5, %xmm3 -; AVX1-NEXT: vmovups %xmm4, 4096(%rdi,%rax,4) -; AVX1-NEXT: vmovups %xmm3, 4112(%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm4, 4112(%rdi,%rax,4) +; AVX1-NEXT: vmovups %xmm3, 4096(%rdi,%rax,4) ; AVX1-NEXT: addq $8, %rax ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %exit @@ -1340,9 +1340,11 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB8_1: # %loop ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-NEXT: vpblendmd %zmm0, %zmm1, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 ; AVX512F-NEXT: vprolvd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) @@ -1357,15 +1359,18 @@ ; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0 ; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1 ; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 -; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} -; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 -; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) +; AVX512VL-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovsxbd %xmm3, %zmm3 +; AVX512VL-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm3 {%k1} +; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 +; AVX512VL-NEXT: vprolvd %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) ; AVX512VL-NEXT: addq $8, %rax ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %exit @@ -1504,11 +1509,11 @@ ; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 ; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 -; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 -; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4 -; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3 -; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4) -; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4) +; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm4 +; XOPAVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 +; XOPAVX2-NEXT: vprotd %xmm3, 4112(%rdi,%rax,4), %xmm3 +; XOPAVX2-NEXT: vmovdqu %xmm3, 4112(%rdi,%rax,4) +; XOPAVX2-NEXT: vmovdqu %xmm4, 4096(%rdi,%rax,4) ; XOPAVX2-NEXT: addq $8, %rax ; XOPAVX2-NEXT: jne .LBB8_1 ; XOPAVX2-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1317,9 +1317,9 @@ ; SSE41-LABEL: constant_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1328,13 +1328,13 @@ ; ; AVX1-LABEL: constant_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -1088,21 +1088,21 @@ ; AVX1-LABEL: constant_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -293,9 +293,9 @@ ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -304,13 +304,13 @@ ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -824,8 +824,10 @@ ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 @@ -843,8 +845,10 @@ ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] +; AVX512VL-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] ; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1066,9 +1066,9 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -1381,9 +1381,9 @@ ; SSE41-LABEL: constant_funnnel_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1392,13 +1392,13 @@ ; ; AVX1-LABEL: constant_funnnel_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -884,9 +884,9 @@ ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -1139,21 +1139,21 @@ ; AVX1-LABEL: constant_funnnel_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -307,9 +307,9 @@ ; SSE41-LABEL: constant_funnnel_v2i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -318,13 +318,13 @@ ; ; AVX1-LABEL: constant_funnnel_v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3158,6 +3158,27 @@ ; F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; F16C-NEXT: addq $40, %rsp ; F16C-NEXT: retq +; +; AVX512-LABEL: cvt_2f64_to_2i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> ret <2 x i16> %2 @@ -3286,8 +3307,9 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -3418,8 +3440,9 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $72, %rsp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> @@ -4121,9 +4144,10 @@ ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] -; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) ; AVX512-NEXT: addq $64, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq @@ -4946,7 +4970,7 @@ ; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0 ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; F16C-NEXT: retq @@ -4962,7 +4986,7 @@ ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -540,12 +540,13 @@ ; AVX1-NEXT: vmovups 16(%rdi), %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-NEXT: vmovups %xmm1, 48(%rsi) -; AVX1-NEXT: vmovups %xmm3, 32(%rsi) -; AVX1-NEXT: vmovups %xmm0, 16(%rsi) -; AVX1-NEXT: vmovups %xmm2, (%rsi) +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovups %ymm1, 32(%rsi) +; AVX1-NEXT: vmovups %ymm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat2_i32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -134,33 +134,33 @@ ; SSE-LABEL: load_i16_stride2_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: psrad $16, %xmm3 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm3, %xmm1 -; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm6, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm6, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) ; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf16: @@ -188,60 +188,22 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf16: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i16_stride2_vf16: ; AVX512: # %bb.0: @@ -262,32 +224,32 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i16_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] @@ -299,25 +261,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm11[0] ; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: packssdw %xmm9, %xmm1 +; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: packssdw %xmm9, %xmm3 -; SSE-NEXT: psrad $16, %xmm7 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm7, %xmm2 +; SSE-NEXT: packssdw %xmm8, %xmm3 ; SSE-NEXT: psrad $16, %xmm6 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm6, %xmm1 +; SSE-NEXT: psrad $16, %xmm2 +; SSE-NEXT: packssdw %xmm6, %xmm2 ; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm12, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm12, 48(%rsi) +; SSE-NEXT: movdqa %xmm10, 32(%rsi) +; SSE-NEXT: movdqa %xmm7, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) ; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf32: @@ -365,115 +327,50 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf32: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf32: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i16_stride2_vf32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm3 -; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) +; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm3 ; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512F-NEXT: vpmovdw %zmm3, 32(%rdx) -; AVX512F-NEXT: vpmovdw %zmm2, (%rdx) +; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) +; AVX512F-NEXT: vpmovdw %zmm3, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm2, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -501,39 +398,39 @@ ; SSE-LABEL: load_i16_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm3 +; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa (%rdi), %xmm9 ; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -546,373 +443,255 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 176(%rdi), %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: movdqa 160(%rdi), %xmm12 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: psrad $16, %xmm15 -; SSE-NEXT: psrad $16, %xmm10 -; SSE-NEXT: packssdw %xmm15, %xmm10 +; SSE-NEXT: psrad $16, %xmm9 +; SSE-NEXT: packssdw %xmm15, %xmm9 ; SSE-NEXT: psrad $16, %xmm14 ; SSE-NEXT: psrad $16, %xmm8 ; SSE-NEXT: packssdw %xmm14, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm11 +; SSE-NEXT: psrad $16, %xmm10 +; SSE-NEXT: packssdw %xmm11, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrad $16, %xmm2 ; SSE-NEXT: psrad $16, %xmm13 -; SSE-NEXT: packssdw %xmm3, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm9 -; SSE-NEXT: packssdw %xmm3, %xmm9 -; SSE-NEXT: psrad $16, %xmm4 -; SSE-NEXT: psrad $16, %xmm12 -; SSE-NEXT: packssdw %xmm4, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: packssdw %xmm2, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm11 -; SSE-NEXT: packssdw %xmm3, %xmm11 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm11, %xmm2 ; SSE-NEXT: psrad $16, %xmm5 +; SSE-NEXT: psrad $16, %xmm12 +; SSE-NEXT: packssdw %xmm5, %xmm12 +; SSE-NEXT: psrad $16, %xmm4 ; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm6, 96(%rsi) -; SSE-NEXT: movdqa %xmm7, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movdqa %xmm1, 112(%rdx) -; SSE-NEXT: movdqa %xmm0, 96(%rdx) -; SSE-NEXT: movdqa %xmm11, 80(%rdx) -; SSE-NEXT: movdqa %xmm12, 64(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, 32(%rdx) +; SSE-NEXT: packssdw %xmm4, %xmm1 +; SSE-NEXT: psrad $16, %xmm3 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: packssdw %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm6, 112(%rsi) +; SSE-NEXT: movdqa %xmm7, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movdqa %xmm0, 112(%rdx) +; SSE-NEXT: movdqa %xmm1, 96(%rdx) +; SSE-NEXT: movdqa %xmm12, 80(%rdx) +; SSE-NEXT: movdqa %xmm2, 64(%rdx) +; SSE-NEXT: movdqa %xmm13, 48(%rdx) +; SSE-NEXT: movdqa %xmm10, 32(%rdx) ; SSE-NEXT: movdqa %xmm8, 16(%rdx) -; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm9, (%rdx) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $24, %rsp -; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2],xmm11[3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm11[1],xmm9[2],xmm11[3],xmm9[4],xmm11[5],xmm9[6],xmm11[7] +; AVX1-ONLY-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4],xmm10[5],xmm13[6],xmm10[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] ; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2],xmm11[3],xmm10[4],xmm11[5],xmm10[6],xmm11[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm11[1],xmm7[2],xmm11[3],xmm7[4],xmm11[5],xmm7[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm11[1],xmm5[2],xmm11[3],xmm5[4],xmm11[5],xmm5[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm11[1],xmm4[2],xmm11[3],xmm4[4],xmm11[5],xmm4[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm10[1],xmm0[2],xmm10[3],xmm0[4],xmm10[5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1],xmm0[2],xmm10[3],xmm0[4],xmm10[5],xmm0[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm11[1],xmm8[2],xmm11[3],xmm8[4],xmm11[5],xmm8[6],xmm11[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm11[1],xmm6[2],xmm11[3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm10[1],xmm3[2],xmm10[3],xmm3[4],xmm10[5],xmm3[6],xmm10[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4],xmm10[5],xmm4[6],xmm10[7] ; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2],xmm10[3],xmm9[4],xmm10[5],xmm9[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm11[1],xmm15[2],xmm11[3],xmm15[4],xmm11[5],xmm15[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm11[1],xmm6[2],xmm11[3],xmm6[4],xmm11[5],xmm6[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm10[1],xmm6[2],xmm10[3],xmm6[4],xmm10[5],xmm6[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0],xmm10[1],xmm2[2],xmm10[3],xmm2[4],xmm10[5],xmm2[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm11[1],xmm3[2],xmm11[3],xmm3[4],xmm11[5],xmm3[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm11[1],xmm0[2],xmm11[3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm11[1],xmm8[2],xmm11[3],xmm8[4],xmm11[5],xmm8[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1],xmm2[2],xmm11[3],xmm2[4],xmm11[5],xmm2[6],xmm11[7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm0, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4],xmm10[5],xmm15[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm10[1],xmm8[2],xmm10[3],xmm8[4],xmm10[5],xmm8[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm14, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm10[1],xmm3[2],xmm10[3],xmm3[4],xmm10[5],xmm3[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm10[1],xmm1[2],xmm10[3],xmm1[4],xmm10[5],xmm1[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm10[1],xmm5[2],xmm10[3],xmm5[4],xmm10[5],xmm5[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4],xmm10[5],xmm4[6],xmm10[7] +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rsi) +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm9 +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 112(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%rdx) ; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: load_i16_stride2_vf64: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm2[0,2],ymm7[4,6],ymm2[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,2],ymm7[0,2],ymm10[4,6],ymm7[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: load_i16_stride2_vf64: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm10 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: load_i16_stride2_vf64: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: load_i16_stride2_vf64: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm6, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm5, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm4, %ymm11 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm2, %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm8, %ymm12 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm7, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i16_stride2_vf64: ; AVX512F: # %bb.0: @@ -920,19 +699,18 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm4 -; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm1 +; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm4 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm5 ; AVX512F-NEXT: vpsrld $16, %zmm3, %zmm6 ; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm7 -; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa %ymm4, 32(%rsi) ; AVX512F-NEXT: vpmovdw %zmm2, 64(%rsi) +; AVX512F-NEXT: vpmovdw %zmm0, (%rsi) ; AVX512F-NEXT: vpmovdw %zmm3, 96(%rsi) +; AVX512F-NEXT: vpmovdw %zmm1, 32(%rsi) ; AVX512F-NEXT: vpmovdw %zmm7, 64(%rdx) ; AVX512F-NEXT: vpmovdw %zmm6, 96(%rdx) ; AVX512F-NEXT: vpmovdw %zmm5, (%rdx) -; AVX512F-NEXT: vpmovdw %zmm1, 32(%rdx) +; AVX512F-NEXT: vpmovdw %zmm4, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -964,7 +742,9 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX2: {{.*}} -; AVX2-ONLY: {{.*}} +; AVX2-FAST: {{.*}} +; AVX2-FAST-PERLANE: {{.*}} +; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -603,25 +603,23 @@ ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 @@ -640,36 +638,34 @@ ; ; AVX512F-LABEL: load_i16_stride3_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] @@ -710,19 +706,19 @@ ; SSE-LABEL: load_i16_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 32(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm11, %xmm2 @@ -765,7 +761,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 160(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] @@ -776,7 +772,7 @@ ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] @@ -925,21 +921,21 @@ ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) -; SSE-NEXT: movdqa %xmm7, 16(%rdx) -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm5, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm9, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 16(%rdx) +; SSE-NEXT: movdqa %xmm7, (%rdx) +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm5, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm4, (%rcx) ; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; @@ -960,7 +956,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] @@ -988,16 +984,14 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm11[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] @@ -1009,10 +1003,9 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm4 @@ -1022,7 +1015,7 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] @@ -1052,23 +1045,22 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6],ymm5[7],ymm2[8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13,14],ymm5[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15] @@ -1079,42 +1071,39 @@ ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm10 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7,8,9],ymm13[10],ymm10[11,12],ymm13[13],ymm10[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] @@ -1123,11 +1112,11 @@ ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1141,69 +1130,67 @@ ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm4[3,4,5,6,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7],ymm4[8],ymm10[9],ymm4[10,11],ymm10[12],ymm4[13,14],ymm10[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm10 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] ; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512F-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm10[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm12 -; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm12, %ymm13 +; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13 +; AVX512F-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] +; AVX512F-NEXT: vpshufb %ymm11, %ymm13, %ymm11 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] ; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm11 -; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3],ymm11[4,5],ymm5[6],ymm11[7],ymm5[8],ymm11[9,10],ymm5[11],ymm11[12,13],ymm5[14],ymm11[15] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm13[3,4,5,6,7],ymm11[8,9,10],ymm13[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12 +; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512F-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7],ymm8[8],ymm0[9,10],ymm8[11],ymm0[12,13],ymm8[14],ymm0[15] -; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rdx) @@ -1247,22 +1234,22 @@ ; SSE-LABEL: load_i16_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 192(%rdi), %xmm11 +; SSE-NEXT: movdqa 240(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 224(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm13 +; SSE-NEXT: movdqa 192(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm10, %xmm1 @@ -1307,7 +1294,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm1[2,0] -; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] @@ -1319,7 +1306,7 @@ ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 272(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] @@ -1332,14 +1319,14 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa 144(%rdi), %xmm14 +; SSE-NEXT: movdqa 96(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] @@ -1352,16 +1339,16 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm12 +; SSE-NEXT: movdqa 288(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 368(%rdi), %xmm11 +; SSE-NEXT: movdqa 320(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] @@ -1373,16 +1360,16 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] @@ -1394,14 +1381,14 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa 368(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] @@ -1690,91 +1677,94 @@ ; SSE-NEXT: pandn %xmm13, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) +; SSE-NEXT: movaps %xmm5, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm8, 32(%rdx) -; SSE-NEXT: movdqa %xmm10, 112(%rdx) -; SSE-NEXT: movdqa %xmm14, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm8, 48(%rdx) +; SSE-NEXT: movdqa %xmm10, 96(%rdx) +; SSE-NEXT: movdqa %xmm14, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movdqa %xmm6, 96(%rcx) -; SSE-NEXT: movdqa %xmm11, 112(%rcx) -; SSE-NEXT: movdqa %xmm0, 64(%rcx) -; SSE-NEXT: movdqa %xmm15, 80(%rcx) -; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm4, 48(%rcx) -; SSE-NEXT: movdqa %xmm9, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm6, 112(%rcx) +; SSE-NEXT: movdqa %xmm11, 96(%rcx) +; SSE-NEXT: movdqa %xmm0, 80(%rcx) +; SSE-NEXT: movdqa %xmm15, 64(%rcx) +; SSE-NEXT: movdqa %xmm1, 48(%rcx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) +; SSE-NEXT: movdqa %xmm7, (%rcx) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3,4],xmm10[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3,4],xmm14[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm0[2],xmm15[3,4],xmm0[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -1795,148 +1785,143 @@ ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm0[1],xmm12[2,3],xmm0[4],xmm12[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm15[2],mem[3,4],xmm15[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3,4],xmm11[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,0,1,0,1,6,7,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0,1],mem[2],xmm8[3,4],mem[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm6[2],mem[3,4],xmm6[5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm15[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm3[2],xmm9[3,4],xmm3[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm6[2],xmm12[3,4],xmm6[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm8[2],mem[3,4],xmm8[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm9[2],mem[3,4],xmm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm15[1],xmm13[2,3],xmm15[4],xmm13[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw $219, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3,4],xmm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm3[1],mem[2,3],xmm3[4],mem[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd $230, (%rsp), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm10[1],mem[2,3],xmm10[4],mem[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3],xmm6[4],mem[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm15[0],mem[1],xmm15[2,3],mem[4],xmm15[5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] @@ -1946,381 +1931,356 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rcx) -; AVX1-ONLY-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride3_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $168, %rsp +; AVX2-ONLY-NEXT: subq $136, %rsp ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm8, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14],ymm4[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm15 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7],ymm0[8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13,14],ymm4[15] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm12, %ymm0 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm3, %ymm5, %ymm10 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm11, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm12, %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm9, %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm9, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm12 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm5, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm8 -; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6],ymm0[7],ymm15[8],ymm0[9],ymm15[10,11],ymm0[12],ymm15[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm15[2],xmm6[3,4],xmm15[5],xmm6[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6],ymm0[7],ymm10[8],ymm0[9],ymm10[10,11],ymm0[12],ymm10[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm5, %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm4 +; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm14 +; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm6, %ymm1, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6],ymm0[7],ymm11[8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14],ymm0[15] -; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX2-ONLY-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1,2],ymm13[3,4,5,6,7],ymm1[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1,2],ymm13[3,4,5,6,7],ymm1[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14],ymm1[15] +; AVX2-ONLY-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7,8,9],ymm4[10],ymm11[11,12],ymm4[13],ymm11[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1,2],ymm11[3,4,5,6,7],ymm4[8,9,10],ymm11[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm10[0,1],xmm3[2],xmm10[3,4],xmm3[5],xmm10[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7,8,9],ymm5[10],ymm12[11,12],ymm5[13],ymm12[14,15] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7],ymm5[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm5[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7,8,9],ymm13[10],ymm5[11,12],ymm13[13],ymm5[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,7,4] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm11[1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7],ymm8[8],ymm11[9,10],ymm8[11],ymm11[12,13],ymm8[14],ymm11[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7],ymm13[8,9,10],ymm5[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm9[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7,8,9],ymm13[10],ymm9[11,12],ymm13[13],ymm9[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0,1,2],ymm13[3,4,5,6,7],ymm9[8,9,10],ymm13[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7,8,9],ymm13[10],ymm14[11,12],ymm13[13],ymm14[14,15] +; AVX2-ONLY-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7],ymm12[8],ymm13[9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm15[1],xmm6[2,3],xmm15[4],xmm6[5,6],xmm15[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7],ymm12[8],ymm14[9,10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm12, %ymm7 ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm14[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm11[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7],ymm7[8],ymm11[9,10],ymm7[11],ymm11[12,13],ymm7[14],ymm11[15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7],ymm10[8],ymm8[9,10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: addq $168, %rsp +; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i16_stride3_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512F-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm5, %ymm1 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm8, %ymm23, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7],ymm4[8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13,14],ymm7[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1],xmm12[2],xmm13[3,4],xmm12[5],xmm13[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512F-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm23 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm19 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm21, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512F-NEXT: vmovdqa %xmm4, %xmm7 +; AVX512F-NEXT: vmovdqa %xmm2, %xmm10 +; AVX512F-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm14[3,4,5,6,7],ymm3[8,9,10],ymm14[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm14 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13,14],ymm2[15] +; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 288(%rdi), %xmm14 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm18 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm19, %ymm3 +; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-NEXT: vmovdqa64 %xmm7, %xmm25 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm6[3,4,5,6,7],ymm4[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512F-NEXT: vpshufb %ymm11, %ymm6, %ymm12 -; AVX512F-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512F-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm10 -; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13,14],ymm12[15] -; AVX512F-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512F-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm10[3,4,5,6,7] -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5 -; AVX512F-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm2 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] -; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512F-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512F-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512F-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm3 -; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] -; AVX512F-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1],xmm1[2],xmm14[3,4],xmm1[5],xmm14[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512F-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm6[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm22, %ymm4 +; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX512F-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm8, %ymm6 +; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %ymm22, %ymm2, %ymm5 +; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512F-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm0, %ymm8 +; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7],ymm6[8],ymm8[9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm12[1],xmm13[2,3],xmm12[4],xmm13[5,6],xmm12[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm2 +; AVX512F-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7],ymm6[8],ymm2[9,10],ymm6[11],ymm2[12,13],ymm6[14],ymm2[15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm19, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512F-NEXT: vpternlogq $226, %ymm16, %ymm0, %ymm15 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm15[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6],xmm5[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -372,147 +372,153 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -643,491 +649,500 @@ ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm0[1,2,3],xmm7[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6,7],ymm4[8],ymm8[9,10,11],ymm4[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7],ymm5[8],ymm8[9,10,11],ymm5[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4],ymm8[5,6,7],ymm7[8],ymm8[9,10,11],ymm7[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-SLOW-NEXT: vpackusdw %xmm9, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3],ymm11[4],ymm8[5,6,7],ymm11[8],ymm8[9,10,11],ymm11[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-SLOW-NEXT: vpackusdw %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7],ymm4[8],ymm7[9,10,11],ymm4[12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm8, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7],ymm8[8],ymm7[9,10,11],ymm8[12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7],ymm4[8],ymm5[9,10,11],ymm4[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm5[1,2,3],ymm6[4],ymm5[5,6,7],ymm6[8],ymm5[9,10,11],ymm6[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm6, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm5[1,2,3],ymm8[4],ymm5[5,6,7],ymm8[8],ymm5[9,10,11],ymm8[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1,2,3],ymm11[4],ymm5[5,6,7],ymm11[8],ymm5[9,10,11],ymm11[12],ymm5[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm11, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-SLOW-NEXT: vpmovqw %ymm7, %xmm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-SLOW-NEXT: vpmovqw %zmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm3, %zmm5 -; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpmovqw %zmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm3, %zmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm7, %zmm3 ; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm4, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-FAST-NEXT: vpmovqw %zmm7, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpmovqw %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r8) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%r8) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1165,11 +1180,11 @@ ; SSE-LABEL: load_i16_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $248, %rsp -; SSE-NEXT: movdqa 224(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm5 +; SSE-NEXT: movdqa 144(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1199,7 +1214,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,2,4,5,6,7] @@ -1208,18 +1223,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7] @@ -1228,18 +1243,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] @@ -1386,212 +1401,214 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rdx) -; SSE-NEXT: movapd %xmm8, (%rdx) +; SSE-NEXT: movapd %xmm12, (%rdx) +; SSE-NEXT: movapd %xmm8, 48(%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm15, 32(%rcx) -; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) +; SSE-NEXT: movapd %xmm15, (%rcx) +; SSE-NEXT: movapd %xmm9, 48(%rcx) +; SSE-NEXT: movapd %xmm13, 32(%rcx) ; SSE-NEXT: movapd %xmm10, 16(%rcx) -; SSE-NEXT: movapd %xmm2, 32(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) -; SSE-NEXT: movapd %xmm14, 48(%r8) +; SSE-NEXT: movapd %xmm2, (%r8) +; SSE-NEXT: movapd %xmm7, 48(%r8) +; SSE-NEXT: movapd %xmm14, 32(%r8) ; SSE-NEXT: movapd %xmm3, 16(%r8) ; SSE-NEXT: addq $248, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $232, %rsp -; AVX1-ONLY-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: subq $248, %rsp +; AVX1-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm5[1,2,3],xmm11[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm5[1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm5[1,2,3],xmm13[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm5[1,2,3],xmm14[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm5[1,2,3],xmm9[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1,2,3],xmm5[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm0[1,2,3],xmm7[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm0[1,2,3],xmm9[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3],xmm8[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0],xmm0[1,2,3],xmm15[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm0[1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, (%rsp), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1599,235 +1616,242 @@ ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: addq $232, %rsp +; AVX1-ONLY-NEXT: addq $248, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7],ymm0[8],ymm3[9,10,11],ymm0[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7],ymm0[8],ymm3[9,10,11],ymm0[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm6[0,1] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1835,36 +1859,36 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) @@ -1874,375 +1898,398 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $136, %rsp -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm13[0,1] +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm6[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm10[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = mem[0,1],ymm11[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm7 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r8) ; AVX2-FAST-NEXT: addq $136, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $168, %rsp -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7],ymm0[8],ymm3[9,10,11],ymm0[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7],ymm4[8],ymm3[9,10,11],ymm4[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7],ymm0[8],ymm3[9,10,11],ymm0[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm8[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7],ymm1[8],ymm3[9,10,11],ymm1[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7],ymm4[8],ymm3[9,10,11],ymm4[12],ymm3[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -2251,12 +2298,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $168, %rsp +; AVX2-FAST-PERLANE-NEXT: addq $280, %rsp # imm = 0x118 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2264,97 +2311,97 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm6 +; AVX512F-SLOW-NEXT: vpmovqw %ymm6, %xmm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqw %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm7, %ymm16 +; AVX512F-SLOW-NEXT: vpmovqw %ymm16, %xmm16 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm10 +; AVX512F-SLOW-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,1,2,0,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm7[2,0,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] @@ -2362,26 +2409,26 @@ ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 @@ -2389,7 +2436,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper @@ -2399,66 +2446,64 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm7, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm4 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm13 -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm15 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpmovqw %zmm12, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm11, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] @@ -2467,7 +2512,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpsrlq $48, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 @@ -2531,20 +2576,20 @@ ; SSE-LABEL: load_i16_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $824, %rsp # imm = 0x338 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm4 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm5 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -2568,7 +2613,7 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2579,9 +2624,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2590,9 +2635,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2603,9 +2648,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2614,9 +2659,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm0 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2627,9 +2672,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2638,9 +2683,9 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2650,18 +2695,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa 416(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] @@ -2670,18 +2715,18 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] @@ -2690,19 +2735,19 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 448(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa 480(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] @@ -2793,576 +2838,581 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,1,2,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,2,0,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm15[0,1,2,0,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm5[0],xmm12[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm5[0],xmm15[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshuflw $116, (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) -; SSE-NEXT: movapd %xmm7, 32(%rcx) -; SSE-NEXT: movapd %xmm10, 112(%rcx) -; SSE-NEXT: movapd %xmm13, 48(%rcx) -; SSE-NEXT: movapd %xmm14, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movapd %xmm9, 112(%r8) -; SSE-NEXT: movapd %xmm6, 96(%r8) -; SSE-NEXT: movapd %xmm0, 80(%r8) -; SSE-NEXT: movapd %xmm15, 64(%r8) -; SSE-NEXT: movapd %xmm12, 48(%r8) -; SSE-NEXT: movapd %xmm1, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r8) -; SSE-NEXT: movapd %xmm4, (%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movapd %xmm8, 112(%rcx) +; SSE-NEXT: movapd %xmm9, 96(%rcx) +; SSE-NEXT: movapd %xmm10, 80(%rcx) +; SSE-NEXT: movapd %xmm11, 64(%rcx) +; SSE-NEXT: movapd %xmm12, 48(%rcx) +; SSE-NEXT: movapd %xmm14, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movapd %xmm15, 112(%r8) +; SSE-NEXT: movapd %xmm0, 96(%r8) +; SSE-NEXT: movapd %xmm1, 80(%r8) +; SSE-NEXT: movapd %xmm4, 64(%r8) +; SSE-NEXT: movapd %xmm5, 48(%r8) +; SSE-NEXT: movapd %xmm6, 32(%r8) +; SSE-NEXT: movapd %xmm7, 16(%r8) +; SSE-NEXT: movapd %xmm13, (%r8) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $824, %rsp # imm = 0x338 -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm7[1,2,3],xmm4[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm0[1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm0[1,2,3],xmm9[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2,3],xmm2[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm7[1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm7[1,2,3],xmm13[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm7[1,2,3],xmm14[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm7[1,2,3],xmm3[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm0[1,2,3],xmm8[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm7[1,2,3],xmm5[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm0, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0],xmm7[1,2,3],xmm8[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm7[1,2,3],xmm6[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm7[1,2,3],xmm12[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm0[1,2,3],xmm7[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm0[1,2,3],xmm14[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpackusdw %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] @@ -3376,60 +3426,60 @@ ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -3442,8 +3492,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[3,1,2,3] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] @@ -3470,7 +3519,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3491,7 +3540,8 @@ ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3508,107 +3558,116 @@ ; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: addq $824, %rsp # imm = 0x338 +; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride4_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: subq $904, %rsp # imm = 0x388 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm3[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -3616,459 +3675,483 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm9 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) @@ -4077,396 +4160,438 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%r8) -; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) +; AVX2-SLOW-NEXT: addq $904, %rsp # imm = 0x388 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride4_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: subq $824, %rsp # imm = 0x338 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpxor %xmm11, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1,2,3],ymm0[4],ymm11[5,6,7],ymm0[8],ymm11[9,10,11],ymm0[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6,7],ymm1[8],ymm11[9,10,11],ymm1[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1,2,3],ymm0[4],ymm11[5,6,7],ymm0[8],ymm11[9,10,11],ymm0[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6,7],ymm1[8],ymm11[9,10,11],ymm1[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpackusdw %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6,7],ymm1[8],ymm11[9,10,11],ymm1[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3],ymm2[4],ymm11[5,6,7],ymm2[8],ymm11[9,10,11],ymm2[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-NEXT: vpackusdw %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1,2,3],ymm1[4],ymm11[5,6,7],ymm1[8],ymm11[9,10,11],ymm1[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-FAST-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = mem[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2,3],ymm7[4],ymm11[5,6,7],ymm7[8],ymm11[9,10,11],ymm7[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm15 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm10 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm8 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm6 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3] ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[2,0,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw $231, (%rsp), %xmm15 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-NEXT: addq $824, %rsp # imm = 0x338 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride4_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm0[0,1] ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm3[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 @@ -4474,229 +4599,293 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7],ymm2[8],ymm0[9,10,11],ymm2[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6,7],ymm3[8],ymm0[9,10,11],ymm3[12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,1,2,3] @@ -4710,90 +4899,53 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[3,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -4801,13 +4953,39 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -4826,11 +5004,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4844,24 +5022,24 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] @@ -4869,506 +5047,479 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-FAST-PERLANE-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride4_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $88, %rsp -; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm26[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 +; AVX512F-SLOW-NEXT: subq $136, %rsp +; AVX512F-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm16 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,0,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,0,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 208(%rdi), %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,0,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm27[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm21[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm20[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm18[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 80(%rdi), %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm27[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm20[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,0,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[0,1,0,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 336(%rdi), %xmm17 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm28[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm23[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm25[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm18[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm20[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm19[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm28[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm30 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm23[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm25[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm24 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm18[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm20[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,2,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm19[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm28, %ymm14 +; AVX512F-SLOW-NEXT: vpmovqw %ymm14, %xmm14 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm26 +; AVX512F-SLOW-NEXT: vpmovqw %zmm26, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm14 +; AVX512F-SLOW-NEXT: vpmovqw %ymm14, %xmm14 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm23 +; AVX512F-SLOW-NEXT: vpmovqw %zmm23, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm20, %ymm0 ; AVX512F-SLOW-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-SLOW-NEXT: vpmovqw %zmm18, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm19, %ymm1 ; AVX512F-SLOW-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm13[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpmovqw %ymm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-SLOW-NEXT: vpmovqw %ymm14, %xmm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm21 -; AVX512F-SLOW-NEXT: vpmovqw %zmm21, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm12[0,1,2,3],zmm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm14 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm17 +; AVX512F-SLOW-NEXT: vpmovqw %zmm17, %xmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm26, %zmm1 +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm14 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm23, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm2, %zmm13 -; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm17, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm12 = mem[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm14 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm26, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm23, %zmm15 +; AVX512F-SLOW-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm14[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm18, %zmm15 +; AVX512F-SLOW-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm15 +; AVX512F-SLOW-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm26, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm23, %zmm9 +; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm18, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm21, %zmm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm17, %zmm3 ; AVX512F-SLOW-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512F-SLOW-NEXT: addq $88, %rsp +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-SLOW-NEXT: addq $136, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] -; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24 -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm1, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14] -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm8, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm8, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm12 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm12 -; AVX512F-FAST-NEXT: vpmovqw %zmm30, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 480(%rdi), %ymm16 -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm1, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vpermt2d %ymm9, %ymm7, %ymm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm26, %xmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm18 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm1, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 320(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vpermt2d %ymm14, %ymm7, %ymm15 -; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm25 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm8, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm8, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm8, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 320(%rdi), %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm8, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpmovqw %zmm22, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm8[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14 ; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm30, %zmm8 -; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm2, %zmm6 +; AVX512F-FAST-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm3[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $16, %zmm23, %zmm3 -; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,3,2,3,1,3,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm15, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm15, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm13 -; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm11 -; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermd %ymm18, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm2 -; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm22, %zmm6 +; AVX512F-FAST-NEXT: vpmovqw %zmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm3[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,2,3,1,3,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm23, %ymm27, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm11 +; AVX512F-FAST-NEXT: vpmovqw %zmm11, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm27, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm26, %ymm27, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm2, %zmm14 +; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm13[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm27, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm14 +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm27, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm15 +; AVX512F-FAST-NEXT: vpmovqw %zmm15, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm27, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm27, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm22, %zmm10 +; AVX512F-FAST-NEXT: vpmovqw %zmm10, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm30, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm23, %zmm4 -; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm22, %zmm3 +; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 64(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -263,38 +263,38 @@ ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rcx) ; AVX1-ONLY-NEXT: vmovq %xmm6, (%r8) -; AVX1-ONLY-NEXT: vmovq %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovq %xmm0, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf4: @@ -304,26 +304,27 @@ ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX2-SLOW-NEXT: vmovq %xmm4, (%rdx) -; AVX2-SLOW-NEXT: vmovq %xmm5, (%rcx) -; AVX2-SLOW-NEXT: vmovq %xmm6, (%r8) -; AVX2-SLOW-NEXT: vmovq %xmm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm5, (%rdx) +; AVX2-SLOW-NEXT: vmovq %xmm6, (%rcx) +; AVX2-SLOW-NEXT: vmovq %xmm0, (%r8) +; AVX2-SLOW-NEXT: vmovq %xmm1, (%r9) ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride5_vf4: @@ -332,25 +333,26 @@ ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FAST-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FAST-NEXT: vmovq %xmm6, (%r8) -; AVX2-FAST-NEXT: vmovq %xmm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FAST-NEXT: vmovq %xmm6, (%rcx) +; AVX2-FAST-NEXT: vmovq %xmm0, (%r8) +; AVX2-FAST-NEXT: vmovq %xmm1, (%r9) ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf4: @@ -359,25 +361,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm4[1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm1[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf4: @@ -395,21 +398,20 @@ ; AVX512F-SLOW-NEXT: vmovd %r10d, %xmm4 ; AVX512F-SLOW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpextrw $3, %xmm1, %eax -; AVX512F-SLOW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vmovd %xmm2, %eax -; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512F-SLOW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],mem[1,2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx) ; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rcx) ; AVX512F-SLOW-NEXT: vmovq %xmm6, (%r8) ; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r9) @@ -420,58 +422,92 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpextrw $5, %xmm0, %eax +; AVX512F-FAST-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpextrw $7, %xmm1, %eax ; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpextrw $3, %xmm1, %eax -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vmovd %xmm2, %eax -; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512F-FAST-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],mem[1,2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-FAST-NEXT: vpermi2d %xmm6, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,7,12,13,10,11,4,5,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r8) ; AVX512F-FAST-NEXT: vmovq %xmm0, (%r9) ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: load_i16_stride5_vf4: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17] -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] -; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) -; AVX512BW-NEXT: vmovq %xmm0, (%rdx) -; AVX512BW-NEXT: vmovq %xmm2, (%rcx) -; AVX512BW-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-NEXT: vmovq %xmm6, (%r9) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: load_i16_stride5_vf4: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpextrw $5, %xmm0, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpextrw $7, %xmm1, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpextrw $6, %xmm0, %eax +; AVX512BW-SLOW-NEXT: vpextrw $1, %xmm0, %r10d +; AVX512BW-SLOW-NEXT: vmovd %r10d, %xmm0 +; AVX512BW-SLOW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpextrw $3, %xmm1, %eax +; AVX512BW-SLOW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,7,12,17,2,7,12,17] +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm1 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 +; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-SLOW-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX512BW-SLOW-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: load_i16_stride5_vf4: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-FAST-NEXT: vpextrw $5, %xmm0, %eax +; AVX512BW-FAST-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512BW-FAST-NEXT: vpextrw $7, %xmm1, %eax +; AVX512BW-FAST-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,6,11,0,1,6,11,0] +; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm3 +; AVX512BW-FAST-NEXT: vpinsrw $3, 32(%rdi), %xmm3, %xmm0 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,7,12,17,2,7,12,17] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-FAST-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 +; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rdx) +; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rcx) +; AVX512BW-FAST-NEXT: vmovq %xmm5, (%r8) +; AVX512BW-FAST-NEXT: vmovq %xmm6, (%r9) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> %strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> @@ -623,7 +659,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] @@ -636,7 +672,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] @@ -717,10 +753,10 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [3,1,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -849,10 +885,10 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -1287,9 +1323,9 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] @@ -1315,9 +1351,9 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] @@ -1365,10 +1401,9 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,3,1,u,0,3,5,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -1377,14 +1412,13 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,6,1,7> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,3,2,u,1,3,6,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] @@ -1403,14 +1437,14 @@ ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [3,1,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,3,5,7,2,0> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] @@ -1468,9 +1502,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] @@ -1496,9 +1530,9 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] @@ -1529,85 +1563,85 @@ ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6],ymm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-SLOW-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,0,7,5,2,0> +; AVX512F-SLOW-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rsi) +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) @@ -1617,93 +1651,90 @@ ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,u,u,u,4,6,1,3> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,1,u,0,3,5,u> +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,u,u,u,4,6,1,3> ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,3,2,u,1,3,6,u> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero +; AVX512F-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,3,5,7,2,0> ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] -; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1756,24 +1787,23 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i16_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 224(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm5 -; SSE-NEXT: movdqa 176(%rdi), %xmm10 -; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: subq $392, %rsp # imm = 0x188 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm13 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm11 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -1781,85 +1811,85 @@ ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: andps %xmm10, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: andps %xmm10, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movaps %xmm10, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: andps %xmm10, %xmm4 +; SSE-NEXT: orps %xmm4, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movaps %xmm10, %xmm1 @@ -1869,17 +1899,17 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] @@ -1888,11 +1918,11 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: psllq $48, %xmm2 -; SSE-NEXT: movaps %xmm10, %xmm3 -; SSE-NEXT: andnps %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: andnps %xmm2, %xmm4 ; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: orps %xmm1, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] @@ -1900,12 +1930,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] @@ -1914,23 +1944,24 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] @@ -1939,23 +1970,23 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] @@ -1969,20 +2000,19 @@ ; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] @@ -1995,14 +2025,14 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0] @@ -2015,13 +2045,13 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] @@ -2033,13 +2063,13 @@ ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: andnps %xmm5, %xmm1 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] @@ -2048,20 +2078,20 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: andnps %xmm13, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] @@ -2073,7 +2103,7 @@ ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] @@ -2083,11 +2113,11 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm9 ; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm10 ; SSE-NEXT: pandn %xmm15, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,7,4,6,7] @@ -2097,7 +2127,7 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -2106,11 +2136,11 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] -; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: pandn %xmm11, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,4,6,7] @@ -2121,19 +2151,19 @@ ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm0[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm12 ; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] @@ -2142,29 +2172,29 @@ ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[3,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm3, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[0,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -2184,62 +2214,62 @@ ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSE-NEXT: por %xmm15, %xmm14 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[2,0] -; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] +; SSE-NEXT: por %xmm13, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) -; SSE-NEXT: movaps %xmm12, 16(%r8) -; SSE-NEXT: movaps %xmm6, 48(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%r8) -; SSE-NEXT: movaps %xmm3, 16(%r9) -; SSE-NEXT: movaps %xmm14, 48(%r9) -; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: movaps %xmm1, 32(%r9) -; SSE-NEXT: addq $408, %rsp # imm = 0x198 +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm12, 48(%r8) +; SSE-NEXT: movaps %xmm6, 32(%r8) +; SSE-NEXT: movaps %xmm9, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps %xmm4, 48(%r9) +; SSE-NEXT: movaps %xmm14, 32(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm1, (%r9) +; SSE-NEXT: addq $392, %rsp # imm = 0x188 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride5_vf32: @@ -2398,7 +2428,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[3,1,2,3] @@ -2450,7 +2480,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,14,15,14,15,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] @@ -2568,203 +2598,190 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $280, %rsp # imm = 0x118 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4],ymm8[5],ymm3[6,7],ymm8[8],ymm3[9,10],ymm8[11],ymm3[12],ymm8[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4],ymm2[5],ymm1[6,7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12],ymm2[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7] ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3],xmm0[4,5,6],xmm13[7] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm13, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0,1,2,3,4],ymm11[5,6,7],ymm9[8,9,10,11,12],ymm11[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7] +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm15[2],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm13[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm13[2],xmm11[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7],ymm14[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm13[2],xmm7[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5,6,7],ymm14[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm0[2],xmm15[3] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7],ymm4[8,9],ymm5[10],ymm4[11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1],ymm6[2],ymm1[3],ymm6[4],ymm1[5,6],ymm6[7],ymm1[8,9],ymm6[10],ymm1[11],ymm6[12],ymm1[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7,8],ymm2[9],ymm11[10],ymm2[11],ymm11[12,13],ymm2[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm12[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7],ymm0[8,9,10,11,12],ymm9[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm1[1,2],ymm6[3],ymm1[4],ymm6[5],ymm1[6,7],ymm6[8],ymm1[9,10],ymm6[11],ymm1[12],ymm6[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2],xmm12[3] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm12[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0],xmm11[1],xmm13[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10],ymm14[11],ymm12[12,13],ymm14[14],ymm12[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [3,1,2,0,7,5,2,0] +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm11[2],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm15[2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -2775,18 +2792,17 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) -; AVX2-SLOW-NEXT: addq $280, %rsp # imm = 0x118 +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -2795,31 +2811,28 @@ ; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm12 @@ -2831,19 +2844,19 @@ ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm12, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,6,1,7> ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm13, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm11 @@ -2854,7 +2867,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] @@ -2867,7 +2880,7 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] @@ -2877,19 +2890,16 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm8[2],ymm5[3],ymm8[4],ymm5[5,6],ymm8[7],ymm5[8,9],ymm8[10],ymm5[11],ymm8[12],ymm5[13,14],ymm8[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10,11],ymm7[12],ymm3[13],ymm7[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] @@ -2900,82 +2910,82 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5],ymm12[6],ymm4[7,8],ymm12[9],ymm4[10,11],ymm12[12],ymm4[13],ymm12[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm15 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [3,1,6,4] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7,8],ymm4[9],ymm12[10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm14[1],ymm3[2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm10[2],mem[3],ymm10[4],mem[5,6],ymm10[7],mem[8,9],ymm10[10],mem[11],ymm10[12],mem[13,14],ymm10[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5],ymm9[6],ymm4[7,8],ymm9[9],ymm4[10,11],ymm9[12],ymm4[13],ymm9[14],ymm4[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 @@ -2992,7 +3002,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) @@ -3004,202 +3014,195 @@ ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5],ymm2[6],ymm7[7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13],ymm2[14],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2,3],xmm1[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4,5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm6[1,2],ymm11[3],ymm6[4],ymm11[5],ymm6[6,7],ymm11[8],ymm6[9,10],ymm11[11],ymm6[12],ymm11[13],ymm6[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm6[1,2],ymm8[3],ymm6[4],ymm8[5],ymm6[6,7],ymm8[8],ymm6[9,10],ymm8[11],ymm6[12],ymm8[13],ymm6[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10,11],ymm10[12],ymm5[13],ymm10[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5],ymm7[6],ymm14[7,8],ymm7[9],ymm14[10,11],ymm7[12],ymm14[13],ymm7[14],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm13, %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5],ymm11[6],ymm15[7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13],ymm11[14],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm9, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5],ymm11[6],ymm0[7,8],ymm11[9],ymm0[10,11],ymm11[12],ymm0[13],ymm11[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm15, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7],ymm7[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm8[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm4[0,1],xmm1[2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm9[2],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7],ymm14[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1],ymm5[2],ymm3[3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8,9],ymm5[10],ymm3[11],ymm5[12],ymm3[13,14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0],xmm9[1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11],ymm10[12],ymm6[13],ymm10[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm5[2],mem[3],ymm5[4],mem[5,6],ymm5[7],mem[8,9],ymm5[10],mem[11],ymm5[12],mem[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0,1],ymm2[2],ymm7[3],ymm2[4],ymm7[5,6],ymm2[7],ymm7[8,9],ymm2[10],ymm7[11],ymm2[12],ymm7[13,14],ymm2[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4],xmm12[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4],ymm7[5],ymm3[6,7],ymm7[8],ymm3[9,10],ymm7[11],ymm3[12],ymm7[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm7[1,2],ymm2[3],ymm7[4],ymm2[5],ymm7[6,7],ymm2[8],ymm7[9,10],ymm2[11],ymm7[12],ymm2[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [3,1,2,0,7,5,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm4[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2,3,4],ymm12[5,6,7],ymm11[8,9,10,11,12],ymm12[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4],ymm8[5],ymm13[6,7],ymm8[8],ymm13[9,10],ymm8[11],ymm13[12],ymm8[13],ymm13[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm15[1],ymm7[2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10],ymm15[11],ymm7[12,13],ymm15[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1],xmm10[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3],ymm6[4],ymm10[5,6],ymm6[7],ymm10[8,9],ymm6[10],ymm10[11],ymm6[12],ymm10[13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] @@ -3215,8 +3218,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 @@ -3265,29 +3268,29 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0],xmm12[1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm15 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm16 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [7,5,2,0,7,5,6,4] +; AVX512F-SLOW-NEXT: vpermd %ymm15, %ymm17, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[14,15,0,1,12,13,12,13,10,11,4,5,2,3,8,9,30,31,16,17,28,29,28,29,26,27,20,21,18,19,24,25] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-SLOW-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3],xmm15[4,5,6],xmm14[7] @@ -3296,8 +3299,8 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm18 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6,7] @@ -3325,34 +3328,34 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0],xmm11[1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm17 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm19 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0],xmm4[1],xmm13[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <3,u,u,0,7,5,2,0> +; AVX512F-SLOW-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm9[0],xmm10[1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [0,6,1,3,4,6,5,7] +; AVX512F-SLOW-NEXT: vpermd %ymm15, %ymm18, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[10,11,10,11,4,5,8,9,14,15,4,5,2,3,12,13,26,27,26,27,20,21,24,25,30,31,20,21,18,19,28,29] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 ; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2,3],xmm15[4,5],xmm14[6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm14[1,2,3,4,5,6,7],ymm4[8],ymm14[9,10,11,12,13,14,15] @@ -3395,8 +3398,8 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper @@ -3404,178 +3407,180 @@ ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,3,2,4,u,u,u> -; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,4,6,1,3> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,1,u,0,3,5,u> -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm16 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6],xmm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,3,2,u,1,3,6,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm15, %zmm12 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm17, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm17 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4],xmm14[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm18, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4],xmm12[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm18, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm15, %zmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,u,u,u,4,6,1,3> +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,3,5,0,0,3,5,0] ; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm12, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm14[1,2,3,4,5,6,7],ymm6[8],ymm14[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm17 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,28,29,26,27,16,17,22,23],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3],xmm15[4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,6,0,1,3,6,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [1,3,6,4,1,3,6,4] +; AVX512F-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm18, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,28,29,26,27,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm18 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,5,2,5,7,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm19, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [3,1,6,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,10,11,4,5,2,3,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,u,3,5,7,2,0> +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm16, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,4,7,0,2,4,7,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm7, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm13[1,2,3,4,5,6,7],ymm2[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,1,3,0,2,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-FAST-NEXT: movb $7, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm3, %ymm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -3658,220 +3663,217 @@ ; SSE-LABEL: load_i16_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1000, %rsp # imm = 0x3E8 -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 448(%rdi), %xmm5 +; SSE-NEXT: movdqa 384(%rdi), %xmm13 +; SSE-NEXT: movdqa 320(%rdi), %xmm9 +; SSE-NEXT: movdqa 336(%rdi), %xmm8 +; SSE-NEXT: movdqa 368(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa 96(%rdi), %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 352(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm6, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm4 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: andps %xmm6, %xmm3 -; SSE-NEXT: orps %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] ; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: andps %xmm6, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa 432(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 336(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 320(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 416(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 400(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] ; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: andps %xmm6, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 160(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] ; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: andnps %xmm2, %xmm3 ; SSE-NEXT: andps %xmm6, %xmm4 ; SSE-NEXT: orps %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 608(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 496(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 ; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,3] +; SSE-NEXT: orps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,3] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: andnps %xmm2, %xmm1 ; SSE-NEXT: andps %xmm6, %xmm4 -; SSE-NEXT: orps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: orps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 608(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa 480(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movaps %xmm6, %xmm1 @@ -3879,38 +3881,38 @@ ; SSE-NEXT: andps %xmm6, %xmm3 ; SSE-NEXT: orps %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,3,2,3] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: psllq $48, %xmm3 -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: andnps %xmm3, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,3,2,3] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: andnps %xmm3, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3921,21 +3923,21 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psllq $48, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: psllq $48, %xmm13 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -3944,22 +3946,20 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3970,23 +3970,23 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3997,24 +3997,24 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: psllq $48, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pshufd $237, (%rsp), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] @@ -4024,25 +4024,25 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pand %xmm6, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] @@ -4051,246 +4051,245 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,4,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psllq $48, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psllq $48, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,3,2,3] ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psllq $48, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psllq $48, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm14, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm12, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] +; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] +; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm14[3,0] ; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: andnps %xmm2, %xmm0 +; SSE-NEXT: andnps %xmm14, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,4,6,7] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm7[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] ; SSE-NEXT: movaps %xmm15, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4302,13 +4301,14 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,2,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -4317,25 +4317,26 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -4344,24 +4345,24 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm10, %xmm1 +; SSE-NEXT: andnps %xmm12, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -4370,19 +4371,20 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0] ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm5, %xmm1 +; SSE-NEXT: andnps %xmm6, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,4,6,7] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4395,50 +4397,49 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm11 ; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm7, %xmm1 +; SSE-NEXT: andnps %xmm10, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,7,4,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm9 ; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: andnps %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,4,6,7] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -4449,135 +4450,137 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm13[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm7[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm10[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm14[3,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,7,4,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, (%rsp), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,0] ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[0,1,1,3] +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pshufd $232, (%rsp), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,1,1,3] @@ -4585,7 +4588,7 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,0] -; SSE-NEXT: por %xmm13, %xmm15 +; SSE-NEXT: por %xmm14, %xmm15 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,1,1,3] @@ -4594,53 +4597,53 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm12[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) +; SSE-NEXT: movaps %xmm4, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps %xmm4, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) +; SSE-NEXT: movaps %xmm4, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) +; SSE-NEXT: movaps %xmm4, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rcx) +; SSE-NEXT: movaps %xmm4, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rcx) +; SSE-NEXT: movaps %xmm4, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps %xmm4, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm8, 112(%r8) ; SSE-NEXT: movaps %xmm9, 96(%r8) ; SSE-NEXT: movaps %xmm11, 80(%r8) @@ -4668,270 +4671,271 @@ ; AVX1-ONLY-LABEL: load_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm0[4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3,4],xmm15[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm11[2,3],mem[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3],xmm6[4,5],xmm15[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm8[2,3],xmm7[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm9, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm6[0,1],mem[2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1,2,3],xmm12[4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm8[0,1,2,3],mem[4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $48, %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm5[2,3],xmm3[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $48, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4959,118 +4963,119 @@ ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm11[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufd $36, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm11[0,1,2,3],mem[4,5],xmm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm11[0,1],mem[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm15[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm7[0,1,2,3],mem[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm8[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1,2,3],mem[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1],xmm10[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm14[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm10[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[0,1,2,3,8,9,2,3,12,13,12,13,12,13,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,1,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,0] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5078,261 +5083,256 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,10,11,4,5,14,15,14,15,14,15,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3],xmm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1],mem[2,3],xmm4[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2,3],xmm15[4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm7[2,3],xmm9[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0,1,2,3],xmm14[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm12[0,1,2,3],mem[4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3],mem[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1,2,3],xmm3[4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm14[2,3],xmm5[4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm9[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm14[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm4[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, (%rsp), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm12[0,1,2],mem[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm3[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,2,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2,3],mem[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -5340,485 +5340,472 @@ ; AVX2-SLOW-LABEL: load_i16_stride5_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4],ymm13[5],ymm9[6,7],ymm13[8],ymm9[9,10],ymm13[11],ymm9[12],ymm13[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm1[1,2],ymm10[3],ymm1[4],ymm10[5],ymm1[6,7],ymm10[8],ymm1[9,10],ymm10[11],ymm1[12],ymm10[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3],ymm4[4],ymm0[5],ymm4[6,7],ymm0[8],ymm4[9,10],ymm0[11],ymm4[12],ymm0[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4],ymm6[5],ymm4[6,7],ymm6[8],ymm4[9,10],ymm6[11],ymm4[12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10],ymm1[11],ymm15[12,13],ymm1[14],ymm15[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,4,6,1,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2],mem[3],ymm1[4,5],mem[6],ymm1[7,8],mem[9],ymm1[10],mem[11],ymm1[12,13],mem[14],ymm1[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10,11],ymm13[12],ymm9[13],ymm13[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10],ymm15[11],ymm7[12,13],ymm15[14],ymm7[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm11[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3],xmm4[4,5,6],xmm11[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5],ymm13[6],mem[7,8],ymm13[9],mem[10,11],ymm13[12],mem[13],ymm13[14],mem[15] +; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5,6],xmm9[7] +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm4[1],ymm10[2,3],ymm4[4],ymm10[5],ymm4[6],ymm10[7,8],ymm4[9],ymm10[10,11],ymm4[12],ymm10[13],ymm4[14],ymm10[15] +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0],xmm13[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm5[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 624(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0],xmm7[1],xmm9[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm2[0],xmm5[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1,2,3,4],ymm13[5,6,7],ymm2[8,9,10,11,12],ymm13[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm4[2],xmm15[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm12[2],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm13[2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm9[2],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm2[2],xmm15[3] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm5[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm7[2],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5],ymm5[6],ymm15[7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13],ymm5[14],ymm15[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3],ymm11[4],ymm14[5,6],ymm11[7],ymm14[8,9],ymm11[10],ymm14[11],ymm11[12],ymm14[13,14],ymm11[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0],xmm13[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0],xmm10[1],xmm13[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1,2,3,4],ymm7[5,6,7],ymm1[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm12[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm10[2],ymm8[3],ymm10[4],ymm8[5,6],ymm10[7],ymm8[8,9],ymm10[10],ymm8[11],ymm10[12],ymm8[13,14],ymm10[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0],xmm15[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = mem[0],xmm4[1],mem[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1,2,3,4],ymm7[5,6,7],ymm1[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm9[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1,2,3,4],ymm7[5,6,7],ymm1[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5],ymm12[6],ymm5[7,8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13],ymm12[14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1],ymm2[2],ymm6[3],ymm2[4],ymm6[5,6],ymm2[7],ymm6[8,9],ymm2[10],ymm6[11],ymm2[12],ymm6[13,14],ymm2[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7,8],ymm15[9],ymm5[10],ymm15[11],ymm5[12,13],ymm15[14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2],ymm2[3],ymm6[4],ymm2[5],ymm6[6,7],ymm2[8],ymm6[9,10],ymm2[11],ymm6[12],ymm2[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,1,2,0,7,5,2,0] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm7[0,1],xmm4[2],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm14[1,2],ymm11[3],ymm14[4],ymm11[5],ymm14[6,7],ymm11[8],ymm14[9,10],ymm11[11],ymm14[12],ymm11[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm13[1,2],ymm9[3],ymm13[4],ymm9[5],ymm13[6,7],ymm9[8],ymm13[9,10],ymm9[11],ymm13[12],ymm9[13],ymm13[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm15[2],xmm2[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0],ymm9[1],mem[2],ymm9[3],mem[4,5],ymm9[6],mem[7,8],ymm9[9],mem[10],ymm9[11],mem[12,13],ymm9[14],mem[15] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm12 = mem[0,1],xmm2[2],mem[3] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm14[1],ymm10[2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10],ymm14[11],ymm10[12,13],ymm14[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm12[1,2],mem[3],ymm12[4],mem[5],ymm12[6,7],mem[8],ymm12[9,10],mem[11],ymm12[12],mem[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm13[0,1],mem[2],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm4[1,2],ymm10[3],ymm4[4],ymm10[5],ymm4[6,7],ymm10[8],ymm4[9,10],ymm10[11],ymm4[12],ymm10[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[1],ymm12[2],mem[3],ymm12[4,5],mem[6],ymm12[7,8],mem[9],ymm12[10],mem[11],ymm12[12,13],mem[14],ymm12[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4],ymm14[5],ymm8[6,7],ymm14[8],ymm8[9,10],ymm14[11],ymm8[12],ymm14[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm15[2],xmm14[3] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm10[0,1],mem[2],ymm10[3],mem[4],ymm10[5,6],mem[7],ymm10[8,9],mem[10],ymm10[11],mem[12],ymm10[13,14],mem[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3],ymm12[4],ymm6[5,6],ymm12[7],ymm6[8,9],ymm12[10],ymm6[11],ymm12[12],ymm6[13,14],ymm12[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3] +; AVX2-SLOW-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = mem[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-SLOW-NEXT: addq $1064, %rsp # imm = 0x428 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -5826,916 +5813,904 @@ ; AVX2-FAST-LABEL: load_i16_stride5_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4],ymm14[5],ymm4[6,7],ymm14[8],ymm4[9,10],ymm14[11],ymm4[12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,0,2,4,6,1,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3],xmm0[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13],ymm0[14],ymm6[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10,11],ymm10[12],ymm4[13],ymm10[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm14[1],mem[2,3],ymm14[4],mem[5],ymm14[6],mem[7,8],ymm14[9],mem[10,11],ymm14[12],mem[13],ymm14[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,4,7,1,6> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10],ymm10[11],ymm13[12,13],ymm10[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,4,6,1,7> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,12,13,10,11,0,1,6,7,2,3,4,5,0,1,18,19,28,29,26,27,16,17,22,23,18,19,20,21,16,17] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5,6],xmm10[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5],ymm7[6],ymm11[7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13],ymm7[14],ymm11[15] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10],ymm13[11],ymm9[12,13],ymm13[14],ymm9[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpblendw $82, (%rsp), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6],xmm11[7] -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5],ymm4[6],mem[7,8],ymm4[9],mem[10,11],ymm4[12],mem[13],ymm4[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm5[0],mem[1],ymm5[2],mem[3],ymm5[4,5],mem[6],ymm5[7,8],mem[9],ymm5[10],mem[11],ymm5[12,13],mem[14],ymm5[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3],xmm11[4,5,6],xmm8[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5],mem[6],ymm9[7,8],mem[9],ymm9[10,11],mem[12],ymm9[13],mem[14],ymm9[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5],ymm5[6],ymm15[7,8],ymm5[9],ymm15[10,11],ymm5[12],ymm15[13],ymm5[14],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm11[5,6,7],ymm3[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1,2,3,4],ymm11[5,6,7],ymm3[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5,6,7],ymm9[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5,6,7],ymm10[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $107, (%rsp), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5],mem[6],ymm12[7,8],mem[9],ymm12[10,11],mem[12],ymm12[13],mem[14],ymm12[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,2,u,u,5,7,2,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm15[2],ymm14[3],ymm15[4],ymm14[5,6],ymm15[7],ymm14[8,9],ymm15[10],ymm14[11],ymm15[12],ymm14[13,14],ymm15[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm8[2],ymm4[3],ymm8[4],ymm4[5,6],ymm8[7],ymm4[8,9],ymm8[10],ymm4[11],ymm8[12],ymm4[13,14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw $214, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm3[0],mem[1,2],ymm3[3],mem[4],ymm3[5],mem[6,7],ymm3[8],mem[9,10],ymm3[11],mem[12],ymm3[13],mem[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7,8],ymm12[9],ymm5[10],ymm12[11],ymm5[12,13],ymm12[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,u,u,5,0,2,7> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm3[1,2],mem[3],ymm3[4],mem[5],ymm3[6,7],mem[8],ymm3[9,10],mem[11],ymm3[12],mem[13],ymm3[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8,9],ymm5[10],ymm3[11],ymm5[12],ymm3[13,14],ymm5[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7],ymm6[8,9],mem[10],ymm6[11],mem[12],ymm6[13,14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4],ymm8[5],ymm4[6,7],ymm8[8],ymm4[9,10],ymm8[11],ymm4[12],ymm8[13],ymm4[14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3,4],xmm6[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [3,1,6,4] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4],ymm5[5],ymm3[6,7],ymm5[8],ymm3[9,10],ymm5[11],ymm3[12],ymm5[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8],ymm8[9],ymm4[10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4],ymm3[5],ymm5[6,7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12],ymm3[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1,2],ymm0[3],mem[4],ymm0[5],mem[6,7],ymm0[8],mem[9,10],ymm0[11],mem[12],ymm0[13],mem[14,15] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10],ymm7[11],ymm13[12,13],ymm7[14],ymm13[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,3,5,7,2,0> +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,2,1,3,0,2,5,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] +; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) ; AVX2-FAST-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: subq $1048, %rsp # imm = 0x418 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4],ymm7[5],ymm0[6,7],ymm7[8],ymm0[9,10],ymm7[11],ymm0[12],ymm7[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm8[1,2],ymm15[3],ymm8[4],ymm15[5],ymm8[6,7],ymm15[8],ymm8[9,10],ymm15[11],ymm8[12],ymm15[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm4[1,2],ymm12[3],ymm4[4],ymm12[5],ymm4[6,7],ymm12[8],ymm4[9,10],ymm12[11],ymm4[12],ymm12[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4],ymm10[5],ymm0[6,7],ymm10[8],ymm0[9,10],ymm10[11],ymm0[12],ymm10[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13],ymm8[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm15[1,2],ymm10[3],ymm15[4],ymm10[5],ymm15[6,7],ymm10[8],ymm15[9,10],ymm10[11],ymm15[12],ymm10[13],ymm15[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4,5],ymm1[6],mem[7,8],ymm1[9],mem[10],ymm1[11],mem[12,13],ymm1[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4,5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4],ymm6[5],ymm3[6,7],ymm6[8],ymm3[9,10],ymm6[11],ymm3[12],ymm6[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm9[0],mem[1],ymm9[2],mem[3],ymm9[4,5],mem[6],ymm9[7,8],mem[9],ymm9[10],mem[11],ymm9[12,13],mem[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4,5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4],ymm1[5],ymm3[6,7],ymm1[8],ymm3[9,10],ymm1[11],ymm3[12],ymm1[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,3],xmm9[4,5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,4,6,1,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,10,11,0,1,6,7,10,11,4,5,4,5,18,19,28,29,26,27,16,17,22,23,26,27,20,21,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, (%rsp), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm7[1],mem[2,3],ymm7[4],mem[5],ymm7[6],mem[7,8],ymm7[9],mem[10,11],ymm7[12],mem[13],ymm7[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13],ymm10[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm5[1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm12[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0],xmm13[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 624(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5,6,7],ymm8[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0],xmm11[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm15[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm11[2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm12[2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm5[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm13[2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7],ymm15[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm6[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $148, (%rsp), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7],ymm6[8,9],mem[10],ymm6[11],mem[12],ymm6[13,14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0],xmm14[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm8[0,1],mem[2],ymm8[3],mem[4],ymm8[5,6],mem[7],ymm8[8,9],mem[10],ymm8[11],mem[12],ymm8[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5],mem[6],ymm14[7,8],mem[9],ymm14[10,11],mem[12],ymm14[13],mem[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7],ymm9[8,9],ymm12[10],ymm9[11],ymm12[12],ymm9[13,14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0],xmm5[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0],xmm5[1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10,11],ymm10[12],ymm2[13],ymm10[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1],ymm5[2],ymm11[3],ymm5[4],ymm11[5,6],ymm5[7],ymm11[8,9],ymm5[10],ymm11[11],ymm5[12],ymm11[13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm15[0],mem[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1,2],mem[3],ymm1[4],mem[5],ymm1[6,7],mem[8],ymm1[9,10],mem[11],ymm1[12],mem[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm11[1,2],ymm5[3],ymm11[4],ymm5[5],ymm11[6,7],ymm5[8],ymm11[9,10],ymm5[11],ymm11[12],ymm5[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7,8],ymm2[9],ymm10[10],ymm2[11],ymm10[12,13],ymm2[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,1,2,0,7,5,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm4[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw $41, (%rsp), %ymm6, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm6[1,2],mem[3],ymm6[4],mem[5],ymm6[6,7],mem[8],ymm6[9,10],mem[11],ymm6[12],mem[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[0],mem[1],ymm6[2],mem[3],ymm6[4,5],mem[6],ymm6[7,8],mem[9],ymm6[10],mem[11],ymm6[12,13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm8[1,2],mem[3],ymm8[4],mem[5],ymm8[6,7],mem[8],ymm8[9,10],mem[11],ymm8[12],mem[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm14[1],mem[2],ymm14[3],mem[4,5],ymm14[6],mem[7,8],ymm14[9],mem[10],ymm14[11],mem[12,13],ymm14[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0],xmm5[1],xmm11[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm12[0,1],mem[2],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,1],xmm2[2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm5[0],mem[1,2],ymm5[3],mem[4],ymm5[5],mem[6,7],ymm5[8],mem[9,10],ymm5[11],mem[12],ymm5[13],mem[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm13[1,2],ymm11[3],ymm13[4],ymm11[5],ymm13[6,7],ymm11[8],ymm13[9,10],ymm11[11],ymm13[12],ymm11[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm7[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7,8],ymm11[9],ymm6[10],ymm11[11],ymm6[12,13],ymm11[14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm10[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm13[1,2],ymm12[3],ymm13[4],ymm12[5],ymm13[6,7],ymm12[8],ymm13[9,10],ymm12[11],ymm13[12],ymm12[13],ymm13[14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8],ymm13[9],ymm15[10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,1,2,0,7,5,2,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm7[2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2],xmm0[3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3],ymm11[4],ymm6[5,6],ymm11[7],ymm6[8,9],ymm11[10],ymm6[11],ymm11[12],ymm6[13,14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3,4],xmm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw $82, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm10[2],ymm8[3],ymm10[4],ymm8[5,6],ymm10[7],ymm8[8,9],ymm10[10],ymm8[11],ymm10[12],ymm8[13,14],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3],ymm6[4],mem[5,6],ymm6[7],mem[8,9],ymm6[10],mem[11],ymm6[12],mem[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7],ymm15[8,9],ymm13[10],ymm15[11],ymm13[12],ymm15[13,14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3,4],xmm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5],ymm6[6],mem[7,8],ymm6[9],mem[10,11],ymm6[12],mem[13],ymm6[14],mem[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: subq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -6745,653 +6720,668 @@ ; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm6 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4],ymm3[5],ymm10[6,7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12],ymm3[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm3[1,2],ymm12[3],ymm3[4],ymm12[5],ymm3[6,7],ymm12[8],ymm3[9,10],ymm12[11],ymm3[12],ymm12[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10,11],ymm11[12],ymm4[13],ymm11[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm21 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,4,6,1,7] +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,4,5,18,19,28,29,26,27,16,17,22,23,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm4[1],ymm11[2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6],xmm4[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vporq %ymm3, %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7],ymm0[8,9],ymm12[10],ymm0[11],ymm12[12],ymm0[13,14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm7[2],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm30 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm29[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0],xmm11[1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 576(%rdi), %ymm13 ; AVX512F-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm13[2],ymm1[3],ymm13[4],ymm1[5,6],ymm13[7],ymm1[8,9],ymm13[10],ymm1[11],ymm13[12],ymm1[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm14[2],ymm1[3],ymm14[4],ymm1[5,6],ymm14[7],ymm1[8,9],ymm14[10],ymm1[11],ymm14[12],ymm1[13,14],ymm14[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm7 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 480(%rdi), %xmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm25, %zmm24 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 496(%rdi), %xmm18 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm18[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,10,11,10,11,14,15,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm23, %zmm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7],ymm12[8,9],ymm15[10],ymm12[11],ymm15[12],ymm12[13,14],ymm15[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4],xmm0[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm12[1],ymm3[2,3],ymm12[4],ymm3[5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10,11],ymm12[12],ymm3[13],ymm12[14],ymm3[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1,2],ymm15[3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm19[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm21[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm14[2],ymm1[3],ymm14[4],ymm1[5,6],ymm14[7],ymm1[8,9],ymm14[10],ymm1[11],ymm14[12],ymm1[13,14],ymm14[15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 160(%rdi), %xmm26 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm26[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm10[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm10[1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm11[2],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1,2],ymm14[3],ymm4[4],ymm14[5],ymm4[6,7],ymm14[8],ymm4[9,10],ymm14[11],ymm4[12],ymm14[13],ymm4[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm26[0,3,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm12, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [7,5,2,0,7,5,6,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10,11],ymm8[12],ymm3[13],ymm8[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vpermd %ymm15, %ymm22, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [14,15,0,1,12,13,12,13,10,11,4,5,2,3,8,9,30,31,16,17,28,29,28,29,26,27,20,21,18,19,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm15[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2],ymm14[3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,3,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm15 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm6[1,2],ymm13[3],ymm6[4],ymm13[5],ymm6[6,7],ymm13[8],ymm6[9,10],ymm13[11],ymm6[12],ymm13[13],ymm6[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm10[2],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5],ymm3[6],ymm12[7,8],ymm3[9],ymm12[10,11],ymm3[12],ymm12[13],ymm3[14],ymm12[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2],ymm11[3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm21[0,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4],ymm5[5],ymm2[6,7],ymm5[8],ymm2[9,10],ymm5[11],ymm2[12],ymm5[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpermd %ymm1, %ymm22, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,3,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm19, %xmm13 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4],ymm1[5],ymm4[6,7],ymm1[8],ymm4[9,10],ymm1[11],ymm4[12],ymm1[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm10[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm25, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm18, %xmm15 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm5[2],xmm11[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $82, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1],ymm14[2],ymm10[3],ymm14[4],ymm10[5,6],ymm14[7],ymm10[8,9],ymm14[10],ymm10[11],ymm14[12],ymm10[13,14],ymm14[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3,4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm11[1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm31 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm8[1],xmm7[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm16[2],xmm12[3],xmm16[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4],xmm13[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm24[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm18[2],xmm15[3],xmm18[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm26[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm21[2],xmm1[3],xmm21[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10,11],ymm14[12],ymm8[13],ymm14[14],ymm8[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $148, (%rsp), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm11 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm9[1],xmm12[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm23, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [3,1,2,0,7,5,2,0] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm15[1],ymm2[2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7,8],ymm15[9],ymm2[10],ymm15[11],ymm2[12,13],ymm15[14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vpermd %ymm10, %ymm24, %ymm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm6[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,6,1,3,4,6,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm7[2],ymm13[3],ymm7[4],ymm13[5,6],ymm7[7],ymm13[8,9],ymm7[10],ymm13[11],ymm7[12],ymm13[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-SLOW-NEXT: vpermd %ymm11, %ymm25, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,10,11,4,5,8,9,14,15,4,5,2,3,12,13,26,27,26,27,20,21,24,25,30,31,20,21,18,19,28,29] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm9[2],xmm12[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm11 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4],ymm14[5],ymm10[6,7],ymm14[8],ymm10[9,10],ymm14[11],ymm10[12],ymm14[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm0[2],xmm7[3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1,2,3],xmm2[4,5],xmm9[6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6,7],ymm9[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm23 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3],xmm0[4,5],xmm15[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermd %ymm11, %ymm24, %ymm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,12,13,10,11,4,5,2,3,2,3,14,15,0,1,22,23,28,29,26,27,20,21,18,19,18,19,30,31,16,17] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm8[1],xmm12[2,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8,9],ymm9[10],ymm7[11],ymm9[12],ymm7[13,14],ymm9[15] +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm0[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm11 = xmm6[0,1],mem[2],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm26, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4,5],xmm10[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6,7],ymm10[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2],ymm9[3],ymm7[4],ymm9[5],ymm7[6,7],ymm9[8],ymm7[9,10],ymm9[11],ymm7[12],ymm9[13],ymm7[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm8[2],xmm12[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 ; AVX512F-SLOW-NEXT: movb $7, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6],xmm7[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6,7],ymm10[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3],mem[4],ymm3[5,6],mem[7],ymm3[8,9],mem[10],ymm3[11],mem[12],ymm3[13,14],mem[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4],ymm3[5],ymm0[6,7],ymm3[8],ymm0[9,10],ymm3[11],ymm0[12],ymm3[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpblendw $82, (%rsp), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6],xmm6[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%r8) -; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm3, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: subq $520, %rsp # imm = 0x208 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm21 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,4,7,1,4,6,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [8,9,3,2,4,5,7,6] -; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm19, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,1,3,0,3,5,7] -; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm27 -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm21, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm25, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm27, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4],ymm15[5],ymm4[6,7],ymm15[8],ymm4[9,10],ymm15[11],ymm4[12],ymm15[13],ymm4[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm17, %ymm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [1,3,0,2,4,6,1,3] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm7[1,2,3],xmm4[4,5],xmm7[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,3,1,3,0,3,5,7] +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm22, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm30 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1,2,3],xmm5[4,5],xmm7[6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3],ymm5[4],ymm9[5],ymm5[6,7],ymm9[8],ymm5[9,10],ymm9[11],ymm5[12],ymm9[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm19, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 496(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX512F-FAST-NEXT: vmovdqa 512(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 544(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm27, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpermt2d %ymm20, %ymm19, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 608(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3,4],xmm3[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 448(%rdi), %ymm29 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm10 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,u,u,u,4,7,1,6> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm22, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vporq %ymm3, %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm13, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm9 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = <2,u,u,u,4,6,1,7> +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm22, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,28,29,26,27,16,17,22,23,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vporq %ymm3, %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm21, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,12,13,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm11 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <0,2,5,7,4,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10,11],ymm12[12],ymm14[13],ymm12[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm21, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm21, %ymm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,3,1,4,6,3] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm12 -; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm8[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,3,6,4,1,3,6,4] +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,2,3,12,13,10,11,0,1,6,7,8,9,16,17,18,19,18,19,28,29,26,27,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,3,2,3,1,3,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm0 +; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm20, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vpblendw $74, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm4[0],mem[1],ymm4[2],mem[3],ymm4[4,5],mem[6],ymm4[7,8],mem[9],ymm4[10],mem[11],ymm4[12,13],mem[14],ymm4[15] +; AVX512F-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm23 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm4 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5],ymm9[6],mem[7,8],ymm9[9],mem[10,11],ymm9[12],mem[13],ymm9[14],mem[15] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm22, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm21, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512F-FAST-NEXT: vpsrlq $48, %xmm29, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm4 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12],ymm4[13],ymm6[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15] +; AVX512F-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm16, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm16, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5],ymm5[6],ymm12[7,8],ymm5[9],ymm12[10,11],ymm5[12],ymm12[13],ymm5[14],ymm12[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm30 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,4,5,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm27 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm16 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7,8],ymm14[9],ymm7[10],ymm14[11],ymm7[12,13],ymm14[14],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm5[1],ymm14[2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7,8],ymm5[9],ymm14[10],ymm5[11],ymm14[12,13],ymm5[14],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm25 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm24, %ymm5 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 @@ -7401,246 +7391,236 @@ ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3],ymm8[4],ymm0[5,6],ymm8[7],ymm0[8,9],ymm8[10],ymm0[11],ymm8[12],ymm0[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = mem[0,1],ymm10[2],mem[3],ymm10[4],mem[5,6],ymm10[7],mem[8,9],ymm10[10],mem[11],ymm10[12],mem[13,14],ymm10[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <0,2,u,u,5,7,2,4> -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm26, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm12 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3,4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,4,6,0,1,4,6,0] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm13, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm25, %zmm15 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm12[3,4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,4,6,0,1,4,6,0] +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm15 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm15 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5],ymm9[6],ymm4[7,8],ymm9[9],ymm4[10,11],ymm9[12],ymm4[13],ymm9[14],ymm4[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm6[1],ymm15[2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7,8],ymm6[9],ymm15[10],ymm6[11],ymm15[12,13],ymm6[14],ymm15[15] ; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm24, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm10 -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm29[2],xmm2[3],xmm29[3] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm31[2],xmm2[3],xmm31[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3],ymm15[4],ymm3[5,6],ymm15[7],ymm3[8,9],ymm15[10],ymm3[11],ymm15[12],ymm3[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm29 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm26, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm12, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0],xmm10[1],xmm11[2,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm8[2],ymm14[3],ymm8[4],ymm14[5,6],ymm8[7],ymm14[8,9],ymm8[10],ymm14[11],ymm8[12],ymm14[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm30 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm6[2],ymm8[3],ymm6[4],ymm8[5,6],ymm6[7],ymm8[8,9],ymm6[10],ymm8[11],ymm6[12],ymm8[13,14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm24 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm13[1],xmm9[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm16 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm19 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm6[2],ymm15[3],ymm6[4],ymm15[5,6],ymm6[7],ymm15[8,9],ymm6[10],ymm15[11],ymm6[12],ymm15[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm26, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm7[1,2],ymm5[3],ymm7[4],ymm5[5],ymm7[6,7],ymm5[8],ymm7[9,10],ymm5[11],ymm7[12],ymm5[13],ymm7[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm17 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,u,u,5,0,2,7> -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm10[1],ymm5[2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7,8],ymm10[9],ymm5[10],ymm10[11],ymm5[12,13],ymm10[14],ymm5[15] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0],xmm13[1,2,3],xmm3[4,5],xmm13[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm24, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4],ymm15[5],ymm12[6,7],ymm15[8],ymm12[9,10],ymm15[11],ymm12[12],ymm15[13],ymm12[14,15] -; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm3[1],ymm15[2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10],ymm3[11],ymm15[12,13],ymm3[14],ymm15[15] -; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm26, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm2 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm25, %zmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,u,u,6,0,3,5> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: movb $7, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm6, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm10[2],xmm11[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1,2],ymm8[3],ymm14[4],ymm8[5],ymm14[6,7],ymm8[8],ymm14[9,10],ymm8[11],ymm14[12],ymm8[13],ymm14[14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm26, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,u,3,5,7,2,0> +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,12,13,10,11,0,1,6,7,6,7,2,3,12,13,18,19,28,29,26,27,16,17,22,23,22,23,18,19,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = [3,1,6,4] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm27, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,12,13,10,11,4,5,2,3,8,9,0,1,2,3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm28, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512F-FAST-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2],ymm5[3],mem[4,5],ymm5[6],mem[7,8],ymm5[9],mem[10],ymm5[11],mem[12,13],ymm5[14],mem[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6],xmm8[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1,2,3],xmm5[4,5],xmm14[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0,1],ymm3[2],ymm15[3],ymm3[4],ymm15[5,6],ymm3[7],ymm15[8,9],ymm3[10],ymm15[11],ymm3[12],ymm15[13,14],ymm3[15] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm24, %ymm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm18 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm27, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7,8],ymm0[9],ymm15[10],ymm0[11],ymm15[12,13],ymm0[14],ymm15[15] +; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm28, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm26, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm25, %ymm4 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rsi) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rdx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm25, %zmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-FAST-NEXT: movb $7, %al +; AVX512F-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6],xmm9[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm7[1,2,3,4,5,6,7],ymm4[8],ymm7[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4],xmm7[5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7],ymm15[8,9],ymm0[10],ymm15[11],ymm0[12],ymm15[13,14],ymm0[15] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512F-FAST-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -386,7 +386,7 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -412,10 +412,11 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -441,10 +442,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] @@ -469,28 +471,25 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] -; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rcx) -; AVX512F-SLOW-NEXT: vmovq %xmm1, (%r8) -; AVX512F-SLOW-NEXT: vmovq %xmm2, (%r9) -; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rax) -; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx) +; AVX512F-SLOW-NEXT: vmovq %xmm6, (%rcx) +; AVX512F-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX512F-SLOW-NEXT: vmovq %xmm1, (%r9) +; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rax) ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i16_stride6_vf4: @@ -502,27 +501,25 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,10,7] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,13,10,3] -; AVX512F-FAST-NEXT: vpermi2d %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpsrld $16, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,2,3,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm0, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%r8) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%r9) -; AVX512F-FAST-NEXT: vmovq %xmm4, (%rax) -; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm6, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%r9) +; AVX512F-FAST-NEXT: vmovq %xmm0, (%rax) ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride6_vf4: @@ -769,7 +766,7 @@ ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] @@ -778,7 +775,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] @@ -832,7 +829,7 @@ ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] @@ -840,7 +837,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] @@ -892,7 +889,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] @@ -900,7 +897,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 74(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,14,15,u,u,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] @@ -1600,29 +1597,29 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 @@ -1642,9 +1639,8 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1656,9 +1652,8 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1706,39 +1701,39 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[0,1],ymm6[0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -1747,7 +1742,7 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] @@ -1755,9 +1750,8 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1767,9 +1761,8 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1777,30 +1770,30 @@ ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r9) @@ -1814,39 +1807,39 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[0,1],ymm6[0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] @@ -1855,7 +1848,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7] @@ -1863,9 +1856,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] @@ -1875,9 +1867,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1885,30 +1876,30 @@ ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r9) @@ -1922,40 +1913,40 @@ ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] -; AVX512F-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm7 +; AVX512F-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] +; AVX512F-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,0,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3],xmm9[4,5],xmm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4] @@ -1991,7 +1982,7 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] @@ -1999,9 +1990,9 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm7, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6] @@ -2012,15 +2003,15 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %ymm5, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm9, (%r8) @@ -2047,25 +2038,25 @@ ; AVX512F-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm9[3,4,5,6,7],ymm2[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u,u,2,3,u,u,10,11,14,15,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7],ymm8[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 @@ -2099,7 +2090,7 @@ ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7] @@ -2118,7 +2109,7 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,7,u,u,u,u,10,11,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpternlogq $236, %ymm11, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] @@ -2210,8 +2201,8 @@ ; SSE-LABEL: load_i16_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $488, %rsp # imm = 0x1E8 -; SSE-NEXT: movdqa 304(%rdi), %xmm5 -; SSE-NEXT: movdqa 320(%rdi), %xmm7 +; SSE-NEXT: movdqa 112(%rdi), %xmm5 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa (%rdi), %xmm3 @@ -2256,7 +2247,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2267,8 +2258,8 @@ ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm3 @@ -2282,7 +2273,7 @@ ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2328,22 +2319,22 @@ ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa 160(%rdi), %xmm1 -; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa 368(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; SSE-NEXT: movdqa %xmm11, %xmm4 @@ -2357,7 +2348,7 @@ ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm4[0,1,0,2,4,5,6,7] @@ -2760,42 +2751,42 @@ ; SSE-NEXT: pandn %xmm8, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps %xmm14, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm7, 16(%r8) +; SSE-NEXT: movdqa %xmm7, 48(%r8) ; SSE-NEXT: movdqa %xmm13, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movdqa %xmm3, 48(%r9) ; SSE-NEXT: movdqa %xmm4, 32(%r9) -; SSE-NEXT: movdqa %xmm5, 48(%r9) +; SSE-NEXT: movdqa %xmm5, 16(%r9) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) +; SSE-NEXT: movdqa %xmm0, 48(%rax) ; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm15, 48(%rax) +; SSE-NEXT: movdqa %xmm15, 16(%rax) ; SSE-NEXT: movdqa %xmm1, (%rax) ; SSE-NEXT: addq $488, %rsp # imm = 0x1E8 ; SSE-NEXT: retq @@ -3253,28 +3244,28 @@ ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] @@ -3282,20 +3273,21 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm15[1],ymm7[2,3,4,5],ymm15[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] @@ -3304,191 +3296,185 @@ ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3],xmm11[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0],ymm8[1],ymm4[2,3,4,5],ymm8[6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3],xmm12[4,5],xmm4[6],xmm12[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3],xmm5[4,5],xmm14[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0],ymm4[1],ymm15[2,3,4,5],ymm4[6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm15 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5],xmm11[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4,5],xmm11[6],xmm15[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2],xmm9[3],xmm14[4,5],xmm9[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $66, (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm13[0],mem[1],ymm13[2,3,4,5],mem[6],ymm13[7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3],xmm7[4,5],xmm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1,2],xmm12[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5,6],xmm7[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7],ymm14[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5,6],xmm8[7] -; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm9[4],xmm15[5,6],xmm9[7] +; AVX2-SLOW-NEXT: vpblendd $148, (%rsp), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1,2],xmm12[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2],ymm5[3,4,5,6,7],ymm12[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm4 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1,2],xmm13[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm12[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] @@ -3501,21 +3487,21 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -3526,42 +3512,42 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX2-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] @@ -3569,24 +3555,25 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm15[1],ymm7[2,3,4,5],ymm15[6],ymm7[7] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -3594,176 +3581,177 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm3 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5],xmm6[6],xmm12[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm8[1],ymm15[2,3,4,5],ymm8[6],ymm15[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4,5],xmm11[6],xmm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $66, (%rsp), %ymm13, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm13[0],mem[1],ymm13[2,3,4,5],mem[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3],xmm7[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm14 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6],xmm9[7] +; AVX2-FAST-NEXT: vpblendd $148, (%rsp), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm4 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -3780,20 +3768,20 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -3805,12 +3793,12 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-NEXT: vzeroupper @@ -3819,27 +3807,27 @@ ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm3[2,3],ymm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] @@ -3847,24 +3835,25 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3],xmm13[4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm14[1],ymm8[2,3,4,5],ymm14[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm15[1],ymm7[2,3,4,5],ymm15[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -3872,176 +3861,177 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5],xmm6[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7],ymm1[8,9,10],ymm6[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3],xmm3[4,5],xmm0[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm8[1],ymm15[2,3,4,5],ymm8[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, (%rsp), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3],xmm15[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3],xmm15[4,5],xmm11[6],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3],xmm12[4,5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, (%rsp), %ymm13, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm13[0],mem[1],ymm13[2,3,4,5],mem[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5],xmm2[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3],xmm7[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm9[4],xmm4[5,6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5,6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $148, (%rsp), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm9, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1,2],ymm7[3,4,5,6,7],ymm13[8,9,10],ymm7[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2],xmm0[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -4058,20 +4048,20 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm3[4],xmm6[5],xmm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -4083,12 +4073,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4097,285 +4087,285 @@ ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: pushq %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm5[2],ymm14[3,4],ymm5[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3],xmm5[4,5],xmm9[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm7[2],xmm2[3],xmm7[4,5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3,4,5],ymm0[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4,5],xmm2[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm11[3],xmm9[4,5],xmm11[6],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm11[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2],xmm5[3],xmm10[4,5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm31 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm15[2],ymm12[3],ymm15[4],ymm12[5,6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3],ymm13[4],ymm15[5,6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2],ymm15[3],ymm0[4],ymm15[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm12[1],ymm0[2,3,4,5],ymm12[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm10, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7],ymm4[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm9[4],xmm6[5],xmm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm11[1],ymm13[2,3,4,5],ymm11[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm11, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3,4,5],ymm15[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm11, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm3, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm30, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm19, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm31, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm17, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: popq %rax ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -4383,276 +4373,278 @@ ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm11[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm11, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm2[1],ymm11[2,3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm0[1],ymm9[2,3,4,5],ymm0[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm2[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3],xmm12[4,5],xmm10[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm12[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3],xmm6[4,5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm16[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5,6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm16[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm17, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5,6],xmm5[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4,5],ymm14[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3],xmm11[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm23, %ymm12, %ymm1 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm15[4],xmm4[5],xmm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5],xmm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm10[1],ymm12[2,3,4,5],ymm10[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm23, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm29, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm22, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r9) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $136, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -4661,260 +4653,263 @@ ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5],xmm8[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3],xmm9[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm13[1],ymm0[2,3,4,5],ymm13[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm9 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm11[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm11 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm10[3,4,5,6,7],ymm8[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 ; AVX512DQ-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3],xmm8[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3],xmm7[4,5],xmm15[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3],xmm5[4,5],xmm9[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm30 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm13[2],ymm9[3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm16 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5,6],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm7, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm17, %zmm18 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm18 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2],xmm8[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5,6],xmm8[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5,6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm9, %ymm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm8[4],xmm4[5],xmm8[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm9[1],ymm13[2,3,4,5],ymm9[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm14, %ymm11, %ymm10 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3],xmm8[4],xmm14[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4,5],ymm11[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0],ymm11[1],ymm9[2,3,4,5],ymm11[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm10, %ymm1, %ymm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm14, %ymm1, %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -4926,11 +4921,11 @@ ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm7 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%r9) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper @@ -4939,270 +4934,273 @@ ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: pushq %rax -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm2[2],ymm13[3,4],ymm2[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3],xmm9[4,5],xmm2[6],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2],xmm1[3],xmm6[4,5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm0[1],ymm9[2,3,4,5],ymm0[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2],ymm9[3,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm2 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm10[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7],ymm11[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm17, %zmm9 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm28 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm29 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3],xmm5[4,5],xmm15[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2],xmm0[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm9[2],ymm12[3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm12[2],ymm9[3],ymm12[4],ymm9[5,6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm17[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3],ymm10[4],ymm13[5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm16, %zmm0, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm17, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1,2],xmm11[3],xmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5,6],xmm14[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm15 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm8 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3,4,5],ymm9[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3],xmm11[4],xmm1[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm13, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm19, %ymm13, %ymm1 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm19, %ymm8, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rdx) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -5313,30 +5311,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i16_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1160, %rsp # imm = 0x488 -; SSE-NEXT: movdqa 496(%rdi), %xmm5 +; SSE-NEXT: subq $1128, %rsp # imm = 0x468 +; SSE-NEXT: movdqa 400(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm3 -; SSE-NEXT: movdqa 176(%rdi), %xmm0 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa 416(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm1 @@ -5353,41 +5351,39 @@ ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,3,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 480(%rdi), %xmm0 +; SSE-NEXT: movdqa 384(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 544(%rdi), %xmm4 -; SSE-NEXT: movdqa 560(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 448(%rdi), %xmm3 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: movdqa 432(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5395,38 +5391,37 @@ ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm10 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa 160(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5434,38 +5429,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 +; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 480(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 -; SSE-NEXT: movdqa 464(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa 544(%rdi), %xmm3 +; SSE-NEXT: movdqa 560(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 528(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5473,38 +5468,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm4 -; SSE-NEXT: movdqa 368(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 256(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5512,38 +5507,38 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 704(%rdi), %xmm0 +; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa 736(%rdi), %xmm4 -; SSE-NEXT: movdqa 752(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5551,183 +5546,182 @@ ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 352(%rdi), %xmm4 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm0 +; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 688(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 640(%rdi), %xmm7 -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa 736(%rdi), %xmm12 +; SSE-NEXT: movdqa 752(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslld $16, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslld $16, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa 624(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm9[0,1,0,2,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movdqa 720(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] @@ -5738,29 +5732,29 @@ ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm10[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pandn %xmm15, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm1[0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -5768,82 +5762,55 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm10[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -5851,56 +5818,55 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -5908,56 +5874,55 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] @@ -5965,382 +5930,407 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm11[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm11[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $132, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm10, %xmm2 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,2,3,3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufhw $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,6] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm12[1] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] ; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: pandn %xmm5, %xmm12 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm12 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1],mem[1] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3] +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: andps %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm5[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,6] -; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] +; SSE-NEXT: movdqa %xmm13, %xmm8 ; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrlq $48, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm15[0],xmm2[1,2,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: andps %xmm15, %xmm2 +; SSE-NEXT: andps %xmm13, %xmm2 ; SSE-NEXT: por %xmm2, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6357,9 +6347,9 @@ ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm2 @@ -6377,10 +6367,10 @@ ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm2, %xmm13 -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: andps %xmm13, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6397,28 +6387,28 @@ ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: andps %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm4 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $48, %xmm1 @@ -6432,62 +6422,62 @@ ; SSE-NEXT: # xmm3 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 +; SSE-NEXT: andps %xmm13, %xmm3 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,7] -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: por %xmm3, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) +; SSE-NEXT: movaps %xmm3, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps %xmm3, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) @@ -6499,12 +6489,12 @@ ; SSE-NEXT: movaps %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r9) -; SSE-NEXT: movdqa %xmm9, 96(%r9) +; SSE-NEXT: movdqa %xmm9, 112(%r9) +; SSE-NEXT: movdqa %xmm10, 96(%r9) ; SSE-NEXT: movdqa %xmm11, 80(%r9) ; SSE-NEXT: movdqa %xmm12, 64(%r9) ; SSE-NEXT: movdqa %xmm0, 48(%r9) @@ -6512,18 +6502,18 @@ ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm15, 112(%rax) +; SSE-NEXT: movdqa %xmm13, 112(%rax) ; SSE-NEXT: movdqa %xmm2, 96(%rax) ; SSE-NEXT: movdqa %xmm5, 80(%rax) -; SSE-NEXT: movdqa %xmm13, 64(%rax) +; SSE-NEXT: movdqa %xmm15, 64(%rax) ; SSE-NEXT: movdqa %xmm14, 48(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: addq $1160, %rsp # imm = 0x488 +; SSE-NEXT: movdqa %xmm6, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: addq $1128, %rsp # imm = 0x468 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride6_vf64: @@ -6540,27 +6530,28 @@ ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -6579,136 +6570,137 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm3 @@ -6743,29 +6735,28 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -6779,9 +6770,10 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7] @@ -6795,12 +6787,12 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -6813,28 +6805,28 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm9 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -6842,313 +6834,316 @@ ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm4[0,1],mem[2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm6[0,1],mem[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm11[4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm10[0,1],mem[2,3],xmm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, (%rsp), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $207, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm13[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm5[0],xmm0[0] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm1[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7157,8 +7152,8 @@ ; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 @@ -7172,20 +7167,62 @@ ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhqdq (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] @@ -7198,231 +7235,190 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1],xmm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1],xmm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm15[0,1,0,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm14[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm14[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,4,6] -; AVX1-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1],mem[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,4,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,4,5,4,6] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm11 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm8 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7440,10 +7436,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) @@ -7460,926 +7456,924 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: addq $1368, %rsp # imm = 0x558 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm7[2,3],ymm6[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm7[2,3],ymm6[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm5[2,3],ymm4[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm5[0,1],ymm4[0,1] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm4[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm6[1],ymm13[2,3,4,5],ymm6[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm11[1],mem[2,3,4,5],ymm11[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm9[1],ymm12[2,3,4,5],ymm9[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[2,3,4,5],ymm10[6],ymm8[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm2[1],xmm13[2,3],xmm2[4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm12, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[1,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm13 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0],xmm12[1],xmm3[2,3],xmm12[4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm12, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm9[0],mem[1],ymm9[2,3,4,5],mem[6],ymm9[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2],xmm14[3],xmm13[4,5],xmm14[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm11[0],mem[1],ymm11[2,3,4,5],mem[6],ymm11[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3],xmm14[4,5],xmm15[6],xmm14[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7],ymm15[8,9,10],ymm14[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3],xmm15[4,5],xmm14[6],xmm15[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3,4,5],ymm8[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3],xmm10[4,5],xmm11[6],xmm10[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm15[2],xmm11[3],xmm15[4,5],xmm11[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7],ymm15[8,9,10],ymm14[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3],xmm9[4,5],xmm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3,4,5],ymm9[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,10,11,u,u,2,3,14,15,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[2,2,2,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm15[3],xmm0[4,5],xmm15[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm15[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3],xmm12[4,5],xmm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,5,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,3,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm12[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm4[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5,6],xmm15[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1,2],xmm6[3],xmm11[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm13[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7],ymm14[8,9,10],ymm13[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,0,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm12[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,6,5,6,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5,6],xmm10[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2],xmm8[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7],ymm9[8,9,10],ymm8[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm7[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[0,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[2,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1,2],xmm6[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7],ymm11[8,9,10],ymm10[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = mem[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3,4],xmm0[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = mem[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2],xmm1[3],xmm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = mem[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,7,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6],mem[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm11 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6],mem[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6],mem[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6],mem[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,3,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[0,1,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2,3],xmm3[4],xmm11[5],xmm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4],xmm9[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8) +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1,2,3],xmm7[4],xmm10[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm13, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rax) -; AVX2-SLOW-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-SLOW-NEXT: addq $1272, %rsp # imm = 0x4F8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm13 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm12, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm10[0],mem[1],ymm10[2,3,4,5],mem[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,1,0,3] ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3],xmm14[4,5],xmm12[6],xmm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2],xmm14[3],xmm13[4,5],xmm14[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm0[3],xmm12[4,5],xmm0[6],xmm12[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3],xmm11[4,5],xmm8[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3],xmm6[4,5],xmm3[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8389,206 +8383,197 @@ ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,1,0,3] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7],ymm11[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm11[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm11 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,6,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5,6],xmm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm9 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm7 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm0[1,2],xmm7[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm5 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm12 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4],xmm8[5],xmm3[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm12 +; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload @@ -8598,8 +8583,8 @@ ; AVX2-FAST-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -8608,34 +8593,34 @@ ; AVX2-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8658,328 +8643,326 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 96(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm14, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm11, (%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i16_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: subq $1256, %rsp # imm = 0x4E8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[0,1],ymm6[0,1] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[0,1],ymm4[0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4,5],ymm7[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,2,2,2,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1],xmm1[2,3],xmm13[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3],xmm13[4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[2,2,2,2,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0],mem[1],ymm14[2,3,4,5],mem[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm10[0],mem[1],ymm10[2,3,4,5],mem[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm4, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3],xmm14[4,5],xmm0[6],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7],ymm15[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $109, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3],xmm13[4,5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3],xmm11[4,5],xmm0[6],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3],xmm14[4,5],xmm12[6],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2],xmm14[3],xmm13[4,5],xmm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2],xmm0[3],xmm12[4,5],xmm0[6],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,0,1,u,u,8,9,12,13,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3],xmm8[4,5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7],ymm1[8,9,10],ymm9[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3],xmm11[4,5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3],xmm6[4,5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm11, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm12, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8989,206 +8972,197 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm12[4],xmm2[5,6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3,4,5,6,7],ymm15[8,9,10],ymm12[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm6[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[2,1,2,0,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7],ymm11[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5,6],xmm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,2,0,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3],xmm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7],ymm9[8,9,10],ymm2[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm0[1,2],xmm7[3],xmm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4],xmm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2],xmm3[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4],xmm9[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4],xmm8[5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload @@ -9198,8 +9172,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm12, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5],xmm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9208,34 +9182,34 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm15, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4],xmm10[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -9258,236 +9232,231 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1272, %rsp # imm = 0x4F8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1416, %rsp # imm = 0x588 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm17 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm7, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,2,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3],xmm11[4,5],xmm13[6],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm0[2],xmm11[3],xmm0[4,5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3],xmm0[4,5],xmm14[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] @@ -9500,9 +9469,9 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] @@ -9513,380 +9482,377 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1,2],ymm11[3,4,5,6,7],ymm8[8,9,10],ymm11[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2],xmm0[3],xmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5,6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0],xmm11[1,2],xmm8[3],xmm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7],ymm9[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[0,1,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm24, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm24, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm4, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: movw $31, %ax ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm8, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm3, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm3 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm14, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm31 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm10[4],xmm1[5],xmm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3],xmm6[4],xmm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm19[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm13, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm22[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm13, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4],xmm13[5],xmm6[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm29, %ymm8, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm18[1,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $236, %ymm30, %ymm6, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm11[4],xmm9[5],xmm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload @@ -9894,599 +9860,580 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX512F-ONLY-SLOW-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm22 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3],xmm0[4,5],xmm14[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2],xmm0[3],xmm11[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,6,5,6,4] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,2,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm23, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5,6],xmm12[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7],ymm14[8,9,10],ymm12[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm28, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm24, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm25, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm30, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: movw $31, %ax ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, (%rsp), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm30, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm31 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3],xmm8[4],xmm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3],xmm1[4],xmm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm30, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm12 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm29, %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, %ymm30, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0],xmm3[1],xmm12[2,3],xmm3[4],xmm12[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] @@ -10498,12 +10445,12 @@ ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload @@ -10515,7 +10462,7 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm30, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm29, %zmm4, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 @@ -10524,9 +10471,9 @@ ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%r8) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) @@ -10536,78 +10483,78 @@ ; ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm2 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm15, %xmm21 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm19 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm2 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm7, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 @@ -10615,143 +10562,142 @@ ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm5, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm1[1],ymm3[2,3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 288(%rdi), %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,2,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm3 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm0[2],xmm12[3],xmm0[4,5],xmm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3],xmm0[4,5],xmm15[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm1[2],xmm14[3],xmm1[4,5],xmm14[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2],ymm15[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3],xmm1[4,5],xmm10[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm10 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5],xmm7[6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10759,970 +10705,948 @@ ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,6,5,6,4] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,6,5,6,4] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1,2],xmm0[3],xmm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3],xmm13[4],xmm15[5,6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm13, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1,2,3],xmm2[4],xmm12[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5,6],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7],ymm11[8,9,10],ymm10[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm29 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm29 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2],xmm0[3],xmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm26, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm4, %ymm1 ; AVX512DQ-SLOW-NEXT: movw $31, %ax ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm7, %xmm17 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm3, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,1] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,2,2,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm5 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm25 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4],xmm10[5],xmm11[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm27[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm11, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4],xmm13[5],xmm0[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[1,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm13, %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm8, %zmm0, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3],xmm8[4],xmm13[5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm14, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm21, %ymm8, %ymm11 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $236, %ymm22, %ymm6, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4],xmm8[5],xmm6[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: addq $840, %rsp # imm = 0x348 +; AVX512DQ-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: subq $968, %rsp # imm = 0x3C8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm6 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,0,1,12,13,12,13,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm2[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm5[1],ymm2[2,3,4,5],ymm5[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm3 ; AVX512DQ-FAST-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3],xmm4[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm1[1],ymm4[2,3,4,5],ymm1[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 480(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3],xmm3[4,5],xmm13[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, 288(%rdi), %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm16, %zmm7 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3],xmm5[4,5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],mem[2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,12,13,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,10,11,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3],xmm0[4,5],xmm14[6],xmm0[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm4 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2],xmm12[3],xmm14[4,5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2],ymm15[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4,5],xmm9[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3],xmm8[4,5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,1] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,6,5,6,4] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,6,5,6,4] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,6,5,6,4] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm13 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1,2],ymm15[3,4,5,6,7],ymm11[8,9,10],ymm15[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm31 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2],xmm11[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,6,7,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm22 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6],xmm14[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2],xmm13[3],xmm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm12 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5,6],xmm12[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7],ymm14[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7],ymm14[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm29 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm20, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2],xmm6[3],xmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm27, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,12,13,12,13,12,13,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm30 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm4, %ymm1 ; AVX512DQ-FAST-NEXT: movw $31, %ax ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm1, %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm6 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[2,2,2,2,4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm4, %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,6,7,6,7,6,7,10,11,10,11,10,11,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1,2,3],xmm4[4],xmm15[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm26 -; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm9, %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm9, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,6,7,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2,3],xmm3[4],xmm14[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm4[4],xmm9[5],xmm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm15, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $236, %ymm28, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5],xmm2[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rsi) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%rdx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm2, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm21, %zmm2, %zmm23 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm2, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm2, %zmm3 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 64(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%r8) @@ -11731,8 +11655,8 @@ ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FAST-NEXT: addq $936, %rsp # imm = 0x3A8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -119,20 +119,20 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FAST-NEXT: vmovd %xmm4, (%rdx) ; AVX2-FAST-NEXT: vmovd %xmm7, (%rcx) -; AVX2-FAST-NEXT: vmovd %xmm5, (%r8) +; AVX2-FAST-NEXT: vmovd %xmm6, (%r8) ; AVX2-FAST-NEXT: vmovd %xmm8, (%r9) ; AVX2-FAST-NEXT: vmovd %xmm3, (%r10) ; AVX2-FAST-NEXT: vmovd %xmm0, (%rax) @@ -147,20 +147,20 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm4, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm7, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovd %xmm5, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovd %xmm6, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm8, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm3, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax) @@ -205,20 +205,20 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpsrld $16, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm8 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm9 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX512F-FAST-NEXT: vmovd %xmm4, (%rdx) ; AVX512F-FAST-NEXT: vmovd %xmm7, (%rcx) -; AVX512F-FAST-NEXT: vmovd %xmm5, (%r8) +; AVX512F-FAST-NEXT: vmovd %xmm6, (%r8) ; AVX512F-FAST-NEXT: vmovd %xmm8, (%r9) ; AVX512F-FAST-NEXT: vmovd %xmm3, (%r10) ; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax) @@ -436,49 +436,49 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vmovq %xmm1, (%rsi) -; AVX2-SLOW-NEXT: vmovq %xmm6, (%rdx) -; AVX2-SLOW-NEXT: vmovq %xmm3, (%rcx) -; AVX2-SLOW-NEXT: vmovq %xmm4, (%r8) -; AVX2-SLOW-NEXT: vmovq %xmm5, (%r9) +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovq %xmm2, (%rsi) +; AVX2-SLOW-NEXT: vmovq %xmm3, (%rdx) +; AVX2-SLOW-NEXT: vmovq %xmm4, (%rcx) +; AVX2-SLOW-NEXT: vmovq %xmm5, (%r8) +; AVX2-SLOW-NEXT: vmovq %xmm6, (%r9) ; AVX2-SLOW-NEXT: vmovq %xmm7, (%r10) ; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -493,26 +493,26 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6],xmm3[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 @@ -523,11 +523,11 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FAST-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FAST-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FAST-NEXT: vmovq %xmm3, (%r8) -; AVX2-FAST-NEXT: vmovq %xmm4, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FAST-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FAST-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FAST-NEXT: vmovq %xmm4, (%r8) +; AVX2-FAST-NEXT: vmovq %xmm5, (%r9) ; AVX2-FAST-NEXT: vmovq %xmm7, (%r10) ; AVX2-FAST-NEXT: vmovq %xmm0, (%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -542,26 +542,26 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm9, %xmm7 @@ -572,11 +572,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm7, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -586,20 +586,20 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] @@ -642,45 +642,45 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm8 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovq %xmm2, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r9) ; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%rax) +; AVX512F-FAST-NEXT: vmovq %xmm3, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -876,7 +876,7 @@ ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] @@ -889,7 +889,7 @@ ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm11[0,2] ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -1063,7 +1063,7 @@ ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 @@ -1146,7 +1146,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -1225,7 +1225,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] @@ -1279,53 +1279,55 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 @@ -1338,10 +1340,10 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r10) +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, (%r10) ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1363,47 +1365,49 @@ ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,14,15,12,13,10,11,8,9,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,8,9,6,7,4,5] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,10,11,8,9,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,4,5,2,3,0,1,14,15,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u] @@ -1415,10 +1419,10 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %xmm8, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%r9) -; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r10) +; AVX512F-FAST-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %xmm11, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm7, (%r10) ; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -1474,107 +1478,103 @@ ; SSE-LABEL: load_i16_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $216, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movaps 32(%rdi), %xmm15 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movaps 160(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 176(%rdi), %xmm15 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] ; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: movaps %xmm12, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 208(%rdi), %xmm5 ; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] ; SSE-NEXT: movaps %xmm12, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: por %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: movdqa %xmm12, %xmm2 @@ -1583,230 +1583,234 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,3,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: movaps %xmm9, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] ; SSE-NEXT: pandn %xmm4, %xmm12 ; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pand %xmm7, %xmm12 ; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm5 -; SSE-NEXT: orps %xmm3, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,1,0,3] +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: andps %xmm7, %xmm2 +; SSE-NEXT: orps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm10, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: orps %xmm10, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa %xmm9, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: psrld $16, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm15 ; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pandn %xmm8, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] @@ -1814,53 +1818,53 @@ ; SSE-NEXT: # xmm2 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm5, 16(%r9) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm15, (%rax) -; SSE-NEXT: movaps %xmm11, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) +; SSE-NEXT: movaps %xmm11, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) +; SSE-NEXT: movapd %xmm13, 16(%rax) +; SSE-NEXT: movapd %xmm8, (%rax) ; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; @@ -2352,7 +2356,7 @@ ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] @@ -2444,7 +2448,7 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15] @@ -2522,7 +2526,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] @@ -2711,12 +2715,12 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 @@ -2727,43 +2731,43 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4],ymm11[5,6,7,8,9,10,11],ymm10[12],ymm11[13,14,15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm11, %ymm10, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,1,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,2,0,4,5,6,4] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,1,4,5,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] @@ -2840,307 +2844,157 @@ ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-FAST-LABEL: load_i16_stride7_vf16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,1,0,2] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,9,u,13,4,u,u,7> +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,5,9,u,12,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm11, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,9,u,13,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[2,3,4,5,10,11] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [2,5,2,5,2,5,2,5] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,11,14,u,u,5,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm16[0,1,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3,4,5,6,7],ymm12[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,7,10,14,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm9[1,2,3,4,5,6,7],ymm3[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,9,12,2,5,9,12] +; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,3,7,0,0,3,7,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm13[5,6,7],ymm3[8,9,10,11,12],ymm13[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,4,7,11,14,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm5, %ymm14 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm3[1,2,3,4,5,6,7],ymm13[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,12,0,3,7,u,u,u> +; AVX512F-FAST-NEXT: vpermi2d %ymm5, %ymm7, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm11, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %ymm9, (%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%r10) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride7_vf16: ; AVX512BW: # %bb.0: @@ -3229,102 +3083,100 @@ ; SSE-LABEL: load_i16_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $600, %rsp # imm = 0x258 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm10 -; SSE-NEXT: movdqa 128(%rdi), %xmm11 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movaps 32(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,1,0,3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 320(%rdi), %xmm5 -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm9 +; SSE-NEXT: movdqa 128(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3334,274 +3186,271 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] +; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm12, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm9 ; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm5[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm6 -; SSE-NEXT: orps %xmm4, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm8 +; SSE-NEXT: orps %xmm4, %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,1,0,1] -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm5 -; SSE-NEXT: pandn %xmm8, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm6[0],xmm9[1,2,3] +; SSE-NEXT: andps %xmm12, %xmm9 +; SSE-NEXT: orps %xmm5, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload @@ -3609,47 +3458,78 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: orps %xmm5, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: andps %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3657,113 +3537,56 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: andps %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: andps %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm14, %xmm0 +; SSE-NEXT: andps %xmm12, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] @@ -3773,253 +3596,282 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrld $16, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,4,7] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: psrlq $16, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm11, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: psrlq $16, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,4,7] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: punpcklwd (%rsp), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2] +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm3[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm15[0],xmm3[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: pshufd $85, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,1,1] -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm2, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) +; SSE-NEXT: movaps %xmm2, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movaps %xmm11, 16(%rax) ; SSE-NEXT: movaps %xmm5, (%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) -; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm0, (%rax) +; SSE-NEXT: movapd %xmm0, 32(%rax) ; SSE-NEXT: movapd %xmm1, 48(%rax) -; SSE-NEXT: movapd %xmm3, 32(%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm3, 16(%rax) +; SSE-NEXT: movapd %xmm4, (%rax) ; SSE-NEXT: addq $600, %rsp # imm = 0x258 ; SSE-NEXT: retq ; @@ -4874,7 +4726,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7,8,9,10,11,12,13],ymm0[14],ymm4[15] @@ -4935,7 +4787,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] @@ -4995,7 +4847,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] @@ -5184,7 +5036,7 @@ ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm13, %ymm1 # 32-byte Folded Reload @@ -5209,7 +5061,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5233,7 +5085,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,3,2,3,2,5] ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] @@ -5261,7 +5113,7 @@ ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] @@ -5287,7 +5139,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm1 @@ -5303,7 +5155,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,7,2,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] @@ -5327,19 +5179,19 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,2,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4] ; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7],ymm8[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm6 @@ -5351,17 +5203,17 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] @@ -5371,7 +5223,7 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5380,12 +5232,12 @@ ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5401,7 +5253,7 @@ ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] @@ -5575,7 +5427,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm1[5,6,7] @@ -5601,7 +5453,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm1 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5627,7 +5479,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5653,7 +5505,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6,7],ymm11[8],ymm0[9,10,11,12,13,14,15] @@ -5673,7 +5525,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7,8,9,10,11,12,13],ymm0[14],ymm6[15] @@ -5691,7 +5543,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm10 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] @@ -5733,7 +5585,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 @@ -5773,7 +5625,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6],ymm9[7,8],ymm7[9,10,11,12,13,14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm0[4,5,6,7] @@ -5789,7 +5641,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] @@ -5931,9 +5783,9 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -6003,8 +5855,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4,5,6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -6046,49 +5897,49 @@ ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4,5],ymm6[6],ymm9[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6],xmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5],xmm10[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,2,0,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm2, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2,3],xmm9[4],xmm12[5],xmm9[6],xmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 @@ -6112,30 +5963,30 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6],ymm1[7,8,9,10,11,12,13],ymm7[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3],ymm9[4,5,6,7,8,9,10],ymm12[11],ymm9[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2,3,4,5],xmm7[6],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7,8,9,10],ymm12[11],ymm10[12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7],ymm9[8,9,10],ymm7[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7],ymm10[8,9,10],ymm7[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5],xmm12[6],xmm7[7] @@ -6157,7 +6008,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] @@ -6173,9 +6024,9 @@ ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] @@ -6216,7 +6067,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm6, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm26 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm18, %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, (%rcx) @@ -6236,300 +6087,319 @@ ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm22, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm28[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm15, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm25[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,9,u,13,4,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm13[4],xmm10[5],xmm13[6],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm10, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 96(%rdi), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm27, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3],xmm14[4],xmm13[5],xmm14[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 352(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5],xmm14[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4,5],ymm7[6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm10[1],xmm15[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3,4,5,6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3,4,5],xmm7[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm6[1],xmm10[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm24, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm23, %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm19, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm7, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm28[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm12[4],xmm2[5],xmm12[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1,2],ymm2[3,4,5,6,7],ymm14[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,11,14,u,u,5,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm27, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm9[1],ymm13[2,3,4],ymm9[5],ymm13[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm31 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm14[1],ymm7[2,3],ymm14[4],ymm7[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm18, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,3,3,u,0,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm10, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm12[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2,3,4,5],xmm2[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm16, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm11[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3,4,5],xmm12[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm28, %ymm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7],ymm1[8,9,10,11,12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm10, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm7, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <9,u,u,u,12,0,3,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm2, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm30, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm11[2],ymm4[3,4,5],ymm11[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <9,u,u,u,13,0,4,7> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4],xmm10[5],xmm8[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <10,u,u,u,4,13,u,1> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm29, %ymm28, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm15, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1],xmm10[2,3,4,5],xmm1[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3,4,5],xmm15[6],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm15, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm26, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3,4,5],xmm0[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm30, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm12, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm25, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm26, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm29, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm13[1],xmm12[2],xmm13[3],xmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7],ymm11[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <9,12,0,3,7,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm26, %ymm27, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm27, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm31, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm31, %zmm11, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, (%rsp), %zmm10, %zmm18 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm24, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm30, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm10, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm5, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $136, %rsp ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -6604,9 +6474,9 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -6676,8 +6546,7 @@ ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7,8,9,10,11,12,13],ymm3[14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -6786,7 +6655,7 @@ ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3],ymm12[4,5,6,7,8,9,10],ymm9[11],ymm12[12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm5 @@ -6907,293 +6776,300 @@ ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm31, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm19, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm18, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm27 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm27[0,1,0,2] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm15[4],xmm12[5],xmm15[6],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm24 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,9,u,13,4,u,u,7> +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 96(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm30 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3],xmm11[4],xmm15[5],xmm11[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm2, %ymm11, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm26 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4],xmm11[5],xmm5[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1],xmm11[2,3,4,5],xmm6[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm17 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm21, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm14, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %zmm20, %zmm18, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm18, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 352(%rdi), %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %ymm28 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm10[2],ymm7[3,4,5],ymm10[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm3[1],xmm6[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm31 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,11,14,u,u,5,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm24[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,u,u,u,12,0,3,7> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm27, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm3, %ymm9, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm3[2],ymm9[3,4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <9,u,u,u,13,0,4,7> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm27, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm31, %zmm3, %zmm17 ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm10, %ymm11, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm6[2],ymm10[3,4,5],ymm6[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm11 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2,3,4,5],xmm8[6],xmm15[7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3,4,5,6,7],ymm1[8,9,10],ymm5[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3,4],ymm2[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <10,u,u,u,4,13,u,1> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm28, %ymm27, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1,2],ymm5[3,4,5,6,7],ymm15[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm15, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3,4,5],xmm8[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm15 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm2[1],xmm6[2,3,4,5],xmm2[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm15, %xmm7 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm31, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,3,u,0,3,7,u> -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm12, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm10, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,6,7,8,9,14,15,u,u,u,u,u,u,16,17,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,3,u,0,3,7,u> +; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7],ymm2[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3,4,5,6,7],ymm5[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm29, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm27, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpermd %ymm24, %ymm5, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm14[4,5,2,3,4,5,10,11,12,13,u,u,u,u,u,u,20,21,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7],ymm7[8,9,10,11,12],ymm5[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm7[1],xmm14[2],xmm7[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm5, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2],xmm9[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <9,12,0,3,7,u,u,u> +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm26 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm26, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm30, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm19, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, (%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -7340,395 +7216,395 @@ ; SSE-LABEL: load_i16_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1352, %rsp # imm = 0x548 -; SSE-NEXT: movdqa 640(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movdqa 528(%rdi), %xmm5 +; SSE-NEXT: movdqa 512(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa 656(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: movdqa 544(%rdi), %xmm8 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 608(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 576(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa 640(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 496(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 608(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 592(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 448(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 560(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 464(%rdi), %xmm11 +; SSE-NEXT: movdqa 576(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 416(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 864(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm13 ; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: movaps 832(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm0 +; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 784(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm13, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm0 +; SSE-NEXT: movdqa 416(%rdi), %xmm13 +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 224(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa 864(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps 720(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 816(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: movdqa 784(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: psrld $16, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: psrld $16, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pand %xmm14, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: pand %xmm14, %xmm11 ; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7736,196 +7612,169 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1] @@ -7933,314 +7782,263 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm9, %xmm6 ; SSE-NEXT: orps %xmm1, %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3] +; SSE-NEXT: andps %xmm9, %xmm6 +; SSE-NEXT: orps %xmm2, %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm3, %xmm0 +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: andps %xmm9, %xmm0 +; SSE-NEXT: orps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm9[0],xmm1[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm1 -; SSE-NEXT: orps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3] +; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm8, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,4,7] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: andnps %xmm8, %xmm9 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm8[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: orps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm11[0],xmm10[1,2,3] +; SSE-NEXT: andps %xmm1, %xmm10 +; SSE-NEXT: orps %xmm9, %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[0,1,0,1] -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: andnps %xmm3, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: orps %xmm9, %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[0,1,1,0,4,5,6,7] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm11[0],xmm13[1,2,3] +; SSE-NEXT: andps %xmm1, %xmm13 +; SSE-NEXT: orps %xmm9, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm9 +; SSE-NEXT: pandn %xmm14, %xmm9 +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm2[0],xmm9[1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm3[0],xmm9[1,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: andnps %xmm3, %xmm0 +; SSE-NEXT: andps %xmm1, %xmm9 +; SSE-NEXT: orps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: andnps %xmm0, %xmm9 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm10 +; SSE-NEXT: orps %xmm3, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8248,46 +8046,44 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm9 +; SSE-NEXT: andnps %xmm1, %xmm9 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: andps %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8297,22 +8093,99 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: andnps %xmm1, %xmm7 +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: andps %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; SSE-NEXT: andnps %xmm1, %xmm10 +; SSE-NEXT: orps %xmm0, %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8321,7 +8194,8 @@ ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -8333,31 +8207,29 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -8367,13 +8239,13 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -8386,8 +8258,7 @@ ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] @@ -8403,10 +8274,10 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8415,10 +8286,11 @@ ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -8431,22 +8303,25 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] +; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -8459,25 +8334,23 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; SSE-NEXT: psrlq $16, %xmm4 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -8490,16 +8363,17 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: psrlq $16, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -8507,7 +8381,7 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -8517,37 +8391,39 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: psrlq $16, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: psrlq $16, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] @@ -8567,7 +8443,7 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 @@ -8586,7 +8462,7 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 @@ -8605,7 +8481,7 @@ ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -8618,14 +8494,14 @@ ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -8635,10 +8511,9 @@ ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm0 @@ -8646,47 +8521,47 @@ ; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,3] +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -8694,10 +8569,10 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] @@ -8711,13 +8586,13 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] @@ -8731,8 +8606,8 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -8751,8 +8626,8 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -8771,8 +8646,8 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -8783,7 +8658,7 @@ ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] @@ -8791,9 +8666,9 @@ ; SSE-NEXT: # xmm1 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,1,0,3] @@ -8811,18 +8686,18 @@ ; SSE-NEXT: # xmm3 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,0,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] @@ -8830,8 +8705,8 @@ ; SSE-NEXT: # xmm4 = mem[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,1,1,3,4,5,6,7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -8841,54 +8716,54 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rsi) +; SSE-NEXT: movaps %xmm3, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps %xmm3, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm3, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rdx) +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rdx) +; SSE-NEXT: movaps %xmm3, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm3, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 64(%rcx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 96(%r8) @@ -8931,7 +8806,8 @@ ; SSE-NEXT: movaps %xmm3, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 112(%rax) ; SSE-NEXT: movapd %xmm1, 96(%rax) @@ -8946,10 +8822,9 @@ ; ; AVX1-ONLY-LABEL: load_i16_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] @@ -8957,15 +8832,15 @@ ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7] @@ -8978,9 +8853,9 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 @@ -8991,196 +8866,194 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm8[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm0[2],xmm6[2],zero +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm4[2],zero +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm0[2],xmm3[2],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm4[2],xmm3[2],zero -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm0[2],xmm3[2],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6],mem[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm13[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] @@ -9189,22 +9062,25 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] @@ -9213,142 +9089,143 @@ ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm12[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vpslld $16, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm5[0],mem[1],xmm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm14[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm12[0],mem[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm8[6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2,3,4,5],mem[6],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpsllq $16, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -9362,33 +9239,32 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -9398,442 +9274,433 @@ ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3],xmm9[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsllq $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpsllq $16, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsllq $16, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm13, %ymm6 -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm0[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsllq $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd $236, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2,3,4,5],mem[6],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm10, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm10[0],mem[1],xmm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm13[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm12[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm13[6,7] +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm5[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0],xmm1[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm1[6],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[0],mem[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,1,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,7,7,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm7[6],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm7[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm10, %ymm12 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd (%rsp), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm7[6],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1,2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,3,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $16, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,3,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -9843,8 +9710,8 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $16, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,3] @@ -9859,205 +9726,236 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm11[1],xmm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm11[0],mem[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm0[1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm9[6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm12[0],mem[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm13[1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = zero,xmm8[1],mem[0],zero +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm0[1],xmm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] @@ -10067,44 +9965,54 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,xmm14[1],mem[0],zero -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[0,1,0,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = zero,xmm0[1],mem[0],zero ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpunpckhdq (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10115,8 +10023,8 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload @@ -10128,100 +10036,59 @@ ; AVX1-ONLY-NEXT: # xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = zero,xmm0[1],mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm9[6,7] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) @@ -10230,23 +10097,24 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i16_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1464, %rsp # imm = 0x5B8 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-SLOW-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -10264,94 +10132,91 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm14[2],ymm7[3,4,5],ymm14[6],ymm7[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm15[2],ymm9[3,4,5],ymm15[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -10365,112 +10230,117 @@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[1,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -10479,18 +10349,16 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -10506,12 +10374,10 @@ ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] @@ -10520,35 +10386,36 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm14[0,1,0,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm8[2],ymm6[3,4,5],ymm8[6],ymm6[7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4],xmm1[5],xmm13[6],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm13[2],ymm12[3,4,5],ymm13[6],ymm12[7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -10557,10 +10424,10 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -10569,11 +10436,10 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] @@ -10582,29 +10448,29 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -10612,15 +10478,13 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,1,2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -10628,15 +10492,14 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -10644,14 +10507,14 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,1,1,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,1,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -10659,13 +10522,13 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] @@ -10674,15 +10537,16 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2,3],ymm4[4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] @@ -10690,8 +10554,9 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] @@ -10705,13 +10570,15 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] @@ -10719,24 +10586,22 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm10[2],ymm14[3,4,5],ymm10[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] @@ -10748,25 +10613,23 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6],ymm4[7,8,9,10,11,12,13],ymm2[14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 @@ -10777,24 +10640,24 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4,5],mem[6],ymm2[7] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, (%rsp), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -10805,9 +10668,9 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 @@ -10816,15 +10679,15 @@ ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] @@ -10834,491 +10697,489 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 656(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 640(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6],xmm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7,8],ymm5[9,10,11,12,13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm12 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm11 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7,8],ymm3[9,10,11,12,13,14],ymm4[15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm5[3],mem[4,5],ymm5[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4],xmm5[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3],xmm4[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 880(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 864(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm4[6],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3],xmm3[4],xmm5[5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5],xmm5[6],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2,3,4,5,6,7],ymm6[8],ymm3[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7,8],ymm6[9],ymm3[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6,7,8],ymm4[9],ymm0[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6,7,8],ymm6[9],ymm1[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2],ymm6[3],mem[4,5],ymm6[6],mem[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4],xmm6[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm6[1,2,3,4,5,6,7],ymm1[8],ymm6[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $237, (%rsp), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3],ymm6[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6,7,8],ymm7[9],ymm6[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] ; AVX2-SLOW-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm12[3],mem[4,5],ymm12[6],mem[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1],mem[2,3],ymm6[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4],xmm6[5],xmm7[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7,8],ymm5[9],ymm3[10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i16_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm9[2],ymm14[3,4,5],ymm9[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm6[1],ymm14[2,3,4],ymm6[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4],ymm10[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm15[2],ymm8[3,4],ymm15[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm11[1],ymm6[2,3],ymm11[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,1,u,5,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm15[2,3],ymm8[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 @@ -11326,554 +11187,561 @@ ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,2,5,3,6,2,5] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4,5],ymm11[6],ymm8[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3,4,5],ymm14[6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm14[2],ymm8[3,4,5],ymm14[6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,3,2,3,2,5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm14[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5],xmm11[6],xmm5[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2,3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm14[2],ymm12[3,4,5],ymm14[6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,2,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm15 +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm7[2],mem[3,4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, (%rsp), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,2,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm10[0,1,2],mem[3],ymm10[4,5],mem[6],ymm10[7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpblendd $51, (%rsp), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vpblendd $183, (%rsp), %ymm13, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, (%rsp), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -11893,10 +11761,10 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) @@ -11913,9 +11781,9 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) @@ -11926,11 +11794,11 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -11938,12 +11806,12 @@ ; AVX2-FAST-PERLANE-LABEL: load_i16_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $1528, %rsp # imm = 0x5F8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -11951,7 +11819,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] @@ -11961,107 +11829,110 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm10[2],ymm14[3,4,5],ymm10[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm15[2],ymm9[3,4,5],ymm15[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4,5],ymm3[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4,5],ymm11[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7,8,9,10],ymm4[11],ymm3[12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] @@ -12069,253 +11940,254 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm13[2,3],ymm7[4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm6[1],ymm10[2,3,4],ymm6[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm9[3],ymm13[4,5],ymm9[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm15 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm14[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm11, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm15[4],xmm4[5],xmm15[6],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm3[0,1,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm12[4],xmm4[5],xmm12[6],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm5[2],ymm10[3,4,5],ymm5[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm13, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -12323,12 +12195,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -12337,401 +12208,405 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm14[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm1[6],ymm3[7,8,9,10,11,12,13],ymm1[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm9[1],ymm6[2,3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm4[1],xmm12[2],xmm4[3],xmm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7,8,9,10,11,12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3,4,5],mem[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7,8,9,10,11,12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4,5],mem[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7,8,9,10,11,12,13],ymm3[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm4[2],ymm14[3,4],ymm4[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, (%rsp), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6],ymm5[7,8],ymm2[9,10,11,12,13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm9[2,3],ymm15[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 656(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 640(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm0[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm9[3],ymm13[4,5],ymm9[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3,4,5,6],ymm5[7,8],ymm2[9,10,11,12,13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3],xmm2[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5,6,7],ymm13[8,9,10,11,12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm13[1,2,3,4,5,6,7],ymm5[8],ymm13[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4],xmm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7],ymm9[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm9[1,2,3,4,5,6,7],ymm6[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2],ymm11[3],mem[4,5],ymm11[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1,2,3,4,5,6],ymm9[7,8],ymm6[9,10,11,12,13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm5[2],xmm13[3],xmm5[4],xmm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3],xmm9[4],xmm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 880(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 864(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm5[6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6],ymm5[7,8],ymm4[9,10,11,12,13,14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5,6,7],ymm9[8,9,10,11,12],ymm12[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm9[1,2,3,4,5,6,7],ymm6[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm5[2],ymm12[3,4],ymm5[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2],xmm9[3],xmm6[4],xmm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7,8],ymm5[9,10,11,12,13,14],ymm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4],xmm1[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm15[2,3],ymm7[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm3[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm9[0],mem[1],ymm9[2,3],mem[4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm8[2,3],ymm3[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4],xmm1[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = mem[0],xmm6[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4],xmm2[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = mem[0],xmm3[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7],ymm2[8,9,10,11,12],ymm6[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3],xmm6[4],xmm2[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0],xmm3[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4],xmm5[5],xmm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4,5,6,7,8],ymm7[9],ymm5[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1,2,3,4,5,6,7],ymm5[8],ymm7[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm12[0,1],mem[2,3],ymm12[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6,7,8],ymm7[9],ymm4[10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -12749,10 +12624,10 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) @@ -12769,9 +12644,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%rax) @@ -12782,441 +12657,439 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1528, %rsp # imm = 0x5F8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $1800, %rsp # imm = 0x708 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: subq $1768, %rsp # imm = 0x6E8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 160(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm19[0,1,0,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm22[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 672(%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6,7,8,9,10],ymm0[11],ymm5[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm2[2],ymm13[3,4,5],ymm2[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6,7,8,9,10],ymm8[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm2[2],ymm11[3,4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4,5],ymm2[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 240(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7,8,9,10],ymm5[11],ymm6[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm26 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm20 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,0,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm10[1],xmm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm10, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4],ymm4[5,6,7,8,9,10,11],ymm3[12],ymm4[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3,4,5],xmm4[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm19[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm13[2,3],ymm0[4,5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4],ymm8[5,6,7,8,9,10,11],ymm6[12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm31, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm22[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm20[0,1,1,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3,4,5],xmm3[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,1,1,2] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm6, %ymm5, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm22[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm20[0,1,1,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm19[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm16, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm1[6],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm18, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm10, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm5[6],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm21, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm7[1],ymm12[2,3,4],ymm7[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm20[0,1,1,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpsrlq $48, %xmm23, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5],xmm0[6],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm22, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3],xmm4[4],xmm6[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 656(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm4[6],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpsrld $16, %xmm15, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4],xmm3[5],xmm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm5[1],xmm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm27[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 {%k1} # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm6[1],xmm4[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,2,1,4,5,6,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 736(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7,8,9,10,11,12,13],ymm4[14],ymm1[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 800(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm5, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm4, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 832(%rdi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm15, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm27 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm27[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,2,4,5,6,7] @@ -13226,2763 +13099,2895 @@ ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,2,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,0,1,14,15,8,9,10,11,4,5,6,7,20,21,20,21,16,17,30,31,24,25,26,27,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4,5,6,7,8],ymm0[9],ymm8[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm0, %xmm27 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm27[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm27[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm5[1],ymm12[2,3,4],ymm5[5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm22, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm13[3],ymm2[4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2],ymm4[3,4,5],ymm8[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm15, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,2,0,4,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm23 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4],xmm1[5],xmm8[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4] +; AVX512F-ONLY-SLOW-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,0,0,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4,5,6,7],ymm1[8,9,10],ymm8[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6],ymm8[7,8,9,10,11,12,13],ymm0[14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5,6,7,8,9,10],ymm14[11],ymm9[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm12, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm13, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7,8,9,10],ymm13[11],ymm3[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm15, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm9, %ymm14, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm3, %ymm13, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm10[0,1],mem[2],ymm10[3,4,5],mem[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6],ymm9[7,8,9,10,11,12,13],ymm0[14],ymm9[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3,4,5],xmm0[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7,8,9,10],ymm15[11],ymm14[12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm2[2,3],ymm14[4,5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm21[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7,8,9,10],ymm15[11],ymm13[12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm1, %ymm13, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3,4,5],xmm13[6],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3],xmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm11, %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm13, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4],ymm0[5,6,7,8,9,10,11],ymm13[12],ymm0[13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7,8,9,10,11],ymm15[12],ymm0[13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3,4,5],xmm13[6],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2],xmm15[3],xmm11[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1,2,3,4,5,6],ymm12[7,8],ymm0[9,10,11,12,13,14],ymm12[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1,2,3,4,5,6],ymm11[7,8],ymm0[9,10,11,12,13,14],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm12[2,3],ymm5[4,5],ymm12[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm27, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3],ymm0[4],ymm10[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7,8],ymm12[9],ymm0[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm23, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3],ymm8[4],ymm0[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4,5,6,7,8],ymm11[9],ymm0[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4,5],mem[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm3[0,1],mem[2],ymm3[3,4,5],mem[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7,8],ymm15[9],ymm14[10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm15[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3,4,5,6,7,8],ymm9[9],ymm15[10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,4,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm27, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm1[0,1],mem[2],ymm1[3,4,5],mem[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm13, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,3,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm10[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm23, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm3[0,1],mem[2],ymm3[3,4,5],mem[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm9, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm9 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm18 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm16 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm21, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm15, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm26, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm25, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $1800, %rsp # imm = 0x708 +; AVX512F-ONLY-SLOW-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <2,5,9,u,12,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: subq $1832, %rsp # imm = 0x728 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,0,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,4,13,4,13,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm6, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm3, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 672(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm14, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm14, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3,4,5],xmm2[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm15[2],ymm8[3,4,5],ymm15[6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2,3,4,5],xmm6[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm5[1],xmm6[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm23, %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3,4,5],xmm1[6],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,9,u,13,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2,3,4,5],xmm4[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [2,5,2,5,2,5,2,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm11, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm31 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm31[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 688(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm3[1],xmm7[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm17, %ymm4, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm11, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3,4,5],xmm9[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,0,3,2,3,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm19, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3,4,5,6],ymm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm28, %ymm27, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm31, %ymm16, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,11,14,7,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm26, %ymm10, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm19[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm22, %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm27, %ymm28, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0],ymm3[1],ymm9[2,3,4],ymm3[5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3,4,5],xmm7[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm31[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm21, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm19, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,3,3,0,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpsrlq $48, %xmm16, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm23, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,3,3,0,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm19, %ymm6, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm13, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm20, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,4,7,11,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm25, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 416(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm20, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 864(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm23, %ymm0, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm10, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm31, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0] -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm18, %ymm10, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm16, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm19, %ymm6, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7],ymm4[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4],xmm5[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm14, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm8, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3,4,5,6],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm10, %ymm16, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm31, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3,4,5,6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm15, %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%rsp), %zmm1, %zmm2 {%k1} # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 800(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 768(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm14, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm12[2],ymm5[3,4,5],ymm12[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,u,u,u,5,8,12,15> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <9,u,u,u,12,0,3,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm11, %ymm7, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm20, %ymm16, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,2,3,4,5,10,11,12,13,20,21,18,19,20,21,22,23,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm12, %ymm14, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4,5],ymm9[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3,4,5],ymm13[6],ymm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm9, %ymm8, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm23, %ymm24, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm2[2],ymm5[3,4,5],ymm2[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm10[2],ymm2[3,4,5],ymm10[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <9,u,u,u,13,0,4,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm11, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 832(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2,3,4,5],xmm2[6],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm12, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm9[2],ymm6[3,4,5],ymm9[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,3,10,3,4,13,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm11, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,7,10,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3,4,5],xmm1[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3,4,5],xmm14[6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm12, %ymm14, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,u,u,u,6,9,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,u,u,6,9,13,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm24 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm23, %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm14, %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm18, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5],xmm7[6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm18, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3,4,5],xmm9[6],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm17, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,u,6,10,13,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2],xmm10[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,6,10,13,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm5, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7],ymm7[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm23, %ymm8, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm14, %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1],xmm15[2],xmm5[3],xmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm18, %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,3,14,7,10,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm13, %ymm8, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,8,11,15,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm26, %zmm10, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm29, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm17 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm10, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <9,12,0,3,7,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm25, %ymm13, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm17 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm21 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm6, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm9, %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm7 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm19, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1832, %rsp # imm = 0x728 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i16_stride7_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $1496, %rsp # imm = 0x5D8 -; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3] +; AVX512DQ-SLOW-NEXT: subq $1528, %rsp # imm = 0x5F8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm19[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 544(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm16[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpbroadcastw 252(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 700(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 672(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; AVX512DQ-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7,8,9,10],ymm6[11],ymm7[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm27 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm10[2],ymm6[3,4,5],ymm10[6],ymm6[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm22[0,1,0,2] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm8[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6,7,8,9,10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6,7,8,9,10],ymm0[11],ymm5[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm16 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm16[0,1,1,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm15, %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm31 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm28, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm4, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm31 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm31[0,1,0,2] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 688(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm19[0,1,1,2] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4],ymm6[5,6,7,8,9,10,11],ymm0[12],ymm6[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[0,1,1,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpor %ymm5, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2],ymm10[3],ymm13[4,5],ymm10[6],ymm13[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm16[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm17, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm19[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 232(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm22[0,1,1,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm25, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm12, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm31[0,1,1,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw 680(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm6, %xmm4 +; AVX512DQ-SLOW-NEXT: vpsrlq $48, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4],xmm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5],xmm0[6],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm20, %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3],xmm7[4],xmm8[5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,5],xmm4[6],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5],xmm2[6],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7],ymm0[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm21, %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm8, %xmm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3],xmm6[4],xmm8[5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 656(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,5],xmm3[6],xmm6[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm15 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-SLOW-NEXT: vpsrld $16, %xmm23, %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm4, %xmm11 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm6[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm14[2,3],ymm3[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 {%k1} # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm6[1],xmm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm7[1],xmm4[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6],ymm3[7,8,9,10,11,12,13],ymm5[14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,1,4,5,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 16-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 736(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2,3],ymm5[4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4],xmm1[5],xmm4[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6],ymm4[7,8,9,10,11,12,13],ymm6[14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6],ymm12[7,8],ymm1[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 864(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm30 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm30[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,1,4,5,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 288(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6],ymm3[7,8,9,10,11,12,13],ymm12[14],ymm3[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 416(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3,4,5,6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm30[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm3[3],ymm15[4,5],ymm3[6],ymm15[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1,2,3],xmm1[4],xmm12[5],xmm1[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4,5,6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6,7,8],ymm1[9],ymm12[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm5[2],ymm14[3,4,5],ymm5[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512DQ-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm30 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm1[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6],ymm12[7,8],ymm1[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm31 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,0,1,14,15,8,9,10,11,4,5,6,7,20,21,20,21,16,17,30,31,24,25,26,27,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6,7,8],ymm0[9],ymm3[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm8[2],ymm4[3,4,5],ymm8[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm8 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm12[4],xmm3[5],xmm12[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm30, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm29 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6,7,8],ymm1[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm15[2],ymm3[3,4,5],ymm15[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm1, %ymm17, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm2[3],ymm10[4,5],ymm2[6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm30 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm14[2],ymm9[3,4,5],ymm14[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5],xmm11[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm13, %ymm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2],ymm10[3,4,5],ymm5[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $242, %ymm0, %ymm30, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm28 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm24 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm11, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm17 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5],xmm7[6],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[0,1,2,0,4,5,6,4] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm7 = mem[0,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm16 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm28 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,2,0] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm10[2],ymm2[3,4,5],ymm10[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm2, %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm16 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm18 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,1,2,0,4,5,6,4] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm12, %ymm26 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm9[2],ymm11[3,4,5],ymm9[6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm10[2],ymm14[3,4,5],ymm10[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,0,1,10,11,24,25,24,25,24,25,24,25,16,17,30,31,16,17,26,27] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27> -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6],ymm11[7,8,9,10,11,12,13],ymm1[14],ymm11[15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3,4,5],xmm11[6],xmm13[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1],ymm12[2,3],ymm0[4,5],ymm12[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm22[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm24 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7,8,9,10],ymm14[11],ymm13[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3,4,5],xmm0[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7,8,9,10],ymm12[11],ymm13[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm13[0],xmm3[1],xmm13[2,3,4,5],xmm3[6],xmm13[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm12, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm24 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm27 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm26[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm23 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm12[3],ymm3[4,5,6,7,8,9,10],ymm12[11],ymm3[12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm13 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1],ymm10[2],mem[3,4],ymm10[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm21[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm29 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7,8,9,10],ymm13[11],ymm12[12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5,6,7,8,9,10,11],ymm11[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 +; AVX512DQ-SLOW-NEXT: vpor %ymm3, %ymm12, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7,8,9,10,11],ymm3[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm12 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3,4,5],xmm12[6],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm18 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6],ymm12[7,8],ymm3[9,10,11,12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm11, %ymm8 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm13 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4],ymm7[5,6,7,8,9,10,11],ymm12[12],ymm7[13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm21 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2],xmm14[3],xmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0,1,2],ymm12[3,4,5,6,7],ymm7[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1],ymm3[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm13 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4],ymm0[5,6,7,8,9,10,11],ymm12[12],ymm0[13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm7[1],ymm14[2,3],ymm7[4],ymm14[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2],xmm12[3],xmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-SLOW-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6],ymm11[7,8],ymm10[9,10,11,12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm20, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7,8],ymm8[9],ymm7[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm22, %zmm2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6,7,8],ymm10[9],ymm0[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1],ymm7[2],mem[3,4],ymm7[5],mem[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4,5],ymm2[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,3,1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1,2],ymm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,4,6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,4,6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm20, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,3,1] -; AVX512DQ-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rsi) +; AVX512DQ-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm30 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, (%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $1528, %rsp # imm = 0x5F8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,5,9,u,12,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm6, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm23[0,1,0,2] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: subq $1512, %rsp # imm = 0x5E8 +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm18[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,9,4,13,4,13,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm13[2],ymm6[3,4,5],ymm13[6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm6, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpbroadcastw 252(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 544(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7] ; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3,4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm27 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 672(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512DQ-FAST-NEXT: vpbroadcastw 700(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3,4,5],xmm4[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa 240(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm22, %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3,4,5],xmm3[6],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm4[2],ymm0[3,4,5],ymm4[6],ymm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm14[1],xmm7[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,6,9,u,13,u,u,u> -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm29[0,1,0,2] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa 688(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm0[1],xmm5[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5],xmm4[6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm10, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm27, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3,4,5],xmm3[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm1[3],ymm10[4,5],ymm1[6],ymm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [10,3,6,15,12,13,6,15] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,1,0,3,2,3,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm18, %ymm16, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm19 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm26, %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3,4,5],xmm10[6],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm2[3],ymm7[4,5],ymm2[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %ymm29, %ymm16, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,11,14,7,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm24, %ymm11, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm18[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm25, %ymm26, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm8 ; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm8, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5,6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm8 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3,4,5],xmm8[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2],xmm4[3],xmm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpermd %zmm18, %zmm17, %zmm11 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm7 -; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm26[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm29[0,1,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm22, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm19, %xmm7 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm10 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm20, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,3,3,3,0,3,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm18, %ymm7, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm14, %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 416(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 864(%rdi), %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm31 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm3, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm10, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpermd %ymm29, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm7 +; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm16, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm18, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7],ymm1[8,9,10,11,12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm13, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm29, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm16 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQ-FAST-NEXT: vpbroadcastw 680(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 -; AVX512DQ-FAST-NEXT: vpsrlq $48, %xmm20, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,3,3,0,3,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7],ymm5[8,9,10,11,12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2,3],ymm10[4,5],ymm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 {%k1} # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpsrld $16, %xmm30, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3],xmm15[4],xmm5[5],xmm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3,4,5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm28 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4],xmm1[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,10,11,10,11,10,11,8,9,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 352(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3,4,5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm2, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm24 {%k1} # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4],xmm1[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 768(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm12, %ymm15, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm30 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,12,13,12,13,12,13,10,11,8,9,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <9,u,u,u,12,0,3,7> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm7, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4,5,6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm10 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm16, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,2,3,4,5,10,11,12,13,20,21,18,19,20,21,22,23,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5],xmm0[6],xmm10[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm15, %ymm12, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm2, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm22, %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm3[2],ymm6[3,4,5],ymm3[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm16, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3],xmm6[4],xmm15[5],xmm6[6],xmm15[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2],xmm6[3,4,5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,5,8,12,15> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <9,u,u,u,13,0,4,7> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm6 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm4, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm15, %ymm12, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm14[2],ymm7[3,4,5],ymm14[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm27 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermd %ymm26, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3,4,5],xmm1[6],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm25 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,10,3,4,13,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm21 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 832(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm19 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2,3,4,5],xmm8[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm15, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm30 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3,4,5],xmm10[6],xmm0[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,6,9,13,u> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm19 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm31 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm31 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm4, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %ymm12, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,u,u,u,6,9,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm29 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm23 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %ymm31, %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm30 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti32x4 $1, %ymm26, %xmm0 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm24 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm14, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,4,7,11,14,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm5[1],ymm11[2,3],ymm5[4],ymm11[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm11, %ymm20 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3,4,5],xmm15[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm12 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,u,u,u,6,10,13,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,6,10,13,u> +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm8, %ymm12, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm19, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1],ymm8[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm20 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm26 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm10 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm22, %ymm4, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,2,3,2,3,2,3,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %ymm12, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3,4,5],xmm2[6],xmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm31, %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm4, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm7, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2],xmm7[3],xmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,8,11,15,u,u,u> -; AVX512DQ-FAST-NEXT: vpermd %zmm22, %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2],xmm4[3],xmm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3] -; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7],ymm2[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <9,12,0,3,7,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm21, %ymm7, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm4, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm10[2],ymm4[3,4],ymm10[5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermd %zmm28, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3,4,5,6,7],ymm3[8,9,10],ymm4[11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm12 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2],xmm12[3],xmm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm25, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm16, %zmm29 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm28, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $1512, %rsp # imm = 0x5E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -541,7 +541,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -560,7 +560,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -602,7 +602,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm2, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 @@ -614,7 +614,7 @@ ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15 ; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] @@ -622,11 +622,11 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 ; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm15 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm15 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] @@ -877,28 +877,28 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps %xmm8, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rcx) +; SSE-NEXT: movaps %xmm3, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%r8) -; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movapd %xmm2, 16(%r9) +; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: movapd %xmm10, 16(%rax) +; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm14, (%rax) @@ -1285,7 +1285,7 @@ ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3] @@ -1350,7 +1350,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm14, %xmm17, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1453,7 +1453,7 @@ ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm7, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 @@ -1487,7 +1487,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm13, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] @@ -1503,7 +1503,7 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3] @@ -1520,7 +1520,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm19, %xmm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1705,14 +1705,14 @@ ; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 ; SSE-NEXT: movdqa 496(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm6 @@ -1739,14 +1739,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 432(%rdi), %xmm0 @@ -1767,46 +1766,47 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa 32(%rdi), %xmm14 @@ -1839,12 +1839,10 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -1856,6 +1854,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1917,247 +1916,245 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: punpckhwd (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm6[0],xmm11[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps (%rsp), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm13[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 48(%rax) -; SSE-NEXT: movapd %xmm11, 32(%rax) +; SSE-NEXT: movapd %xmm12, 32(%rax) +; SSE-NEXT: movapd %xmm11, 48(%rax) ; SSE-NEXT: movapd %xmm10, 16(%rax) -; SSE-NEXT: movaps %xmm13, (%rax) +; SSE-NEXT: movaps %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) ; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps %xmm14, 16(%rax) +; SSE-NEXT: movaps %xmm15, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 ; SSE-NEXT: retq @@ -2566,288 +2563,288 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $856, %rsp # imm = 0x358 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,1,0,2] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm14, %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm13 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm10[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm15 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] @@ -2855,11 +2852,10 @@ ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] @@ -2867,12 +2863,12 @@ ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2880,20 +2876,21 @@ ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm7 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm9 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm13 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] @@ -2920,31 +2917,30 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %xmm15, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -2956,8 +2952,8 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] @@ -2971,41 +2967,43 @@ ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm7 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm7 -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm8 +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm12, %xmm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] @@ -3013,54 +3011,53 @@ ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3077,7 +3074,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] @@ -3237,7 +3234,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm9, %xmm8, %xmm5 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm5[0,1],mem[2,3] @@ -3275,7 +3272,7 @@ ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm2, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload @@ -3403,7 +3400,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm12, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -3460,7 +3457,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm1, %xmm0 @@ -3555,7 +3552,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm13 @@ -3587,7 +3584,7 @@ ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -3623,7 +3620,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [3,7,3,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm29, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -3661,7 +3658,7 @@ ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -3725,7 +3722,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [1,5,1,1] ; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm8, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3753,7 +3750,7 @@ ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm4 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] @@ -3978,224 +3975,222 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i16_stride8_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1752, %rsp # imm = 0x6D8 -; SSE-NEXT: movdqa 752(%rdi), %xmm2 +; SSE-NEXT: subq $1800, %rsp # imm = 0x708 +; SSE-NEXT: movdqa 240(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 624(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm4 +; SSE-NEXT: movdqa 608(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movdqa 528(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 512(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa 560(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm1 +; SSE-NEXT: movdqa 736(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 +; SSE-NEXT: movdqa 720(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm13 +; SSE-NEXT: movdqa 704(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 560(%rdi), %xmm2 +; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm0 +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 656(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 640(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa 320(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm0 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1008(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 992(%rdi), %xmm1 +; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm15 +; SSE-NEXT: movdqa 832(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa 944(%rdi), %xmm2 +; SSE-NEXT: movdqa 816(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm0 +; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm2 +; SSE-NEXT: movdqa 784(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 464(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: movdqa 448(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa 432(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm1 +; SSE-NEXT: movdqa 416(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, (%rsp) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm0 +; SSE-NEXT: movdqa 1008(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa 992(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa 816(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa 960(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa 944(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 928(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa 912(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4205,51 +4200,30 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -4258,24 +4232,15 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4283,284 +4248,317 @@ ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] +; SSE-NEXT: movaps %xmm11, %xmm10 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movapd %xmm7, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movapd %xmm8, %xmm0 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa %xmm6, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4570,19 +4568,21 @@ ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -4594,10 +4594,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4619,59 +4617,59 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm14, %xmm15 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4680,23 +4678,23 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -4710,87 +4708,87 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2] -; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movapd (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movapd %xmm14, %xmm0 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, %xmm7 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: pshufd $255, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm6[2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] @@ -4805,7 +4803,8 @@ ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4820,8 +4819,7 @@ ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,3,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -4829,67 +4827,67 @@ ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rdx) +; SSE-NEXT: movaps %xmm15, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rdx) +; SSE-NEXT: movaps %xmm15, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rdx) +; SSE-NEXT: movaps %xmm15, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps %xmm15, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rcx) +; SSE-NEXT: movaps %xmm15, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm15, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps %xmm15, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 80(%rcx) +; SSE-NEXT: movaps %xmm15, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 96(%r8) @@ -4922,12 +4920,12 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 96(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -4936,12 +4934,12 @@ ; SSE-NEXT: movaps %xmm14, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 112(%rax) -; SSE-NEXT: movapd %xmm10, 96(%rax) -; SSE-NEXT: movapd %xmm11, 80(%rax) +; SSE-NEXT: movapd %xmm10, 112(%rax) +; SSE-NEXT: movapd %xmm11, 96(%rax) +; SSE-NEXT: movapd %xmm13, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movaps %xmm9, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -4960,63 +4958,63 @@ ; SSE-NEXT: movaps %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm8, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1752, %rsp # imm = 0x6D8 +; SSE-NEXT: addq $1800, %rsp # imm = 0x708 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i16_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: subq $2040, %rsp # imm = 0x7F8 +; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] @@ -5027,56 +5025,56 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5086,59 +5084,59 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 672(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 928(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5431,71 +5429,24 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload @@ -5529,7 +5480,6 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5542,15 +5492,33 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -5558,378 +5526,407 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm10[2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm12[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm12[0],mem[0],xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm4[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $2056, %rsp # imm = 0x808 +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: addq $2040, %rsp # imm = 0x7F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i16_stride8_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $2472, %rsp # imm = 0x9A8 -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5942,9 +5939,9 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5960,28 +5957,28 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa 784(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 816(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5989,22 +5986,22 @@ ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6019,38 +6016,38 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6060,10 +6057,10 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6074,56 +6071,56 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 624(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 880(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastd %xmm7, %xmm7 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm9, %xmm9 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa 784(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 560(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 816(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,1,0,2] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm9[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm10[0,1,0,2] ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6144,8 +6141,8 @@ ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm10[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -6163,27 +6160,27 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -6196,7 +6193,7 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -6216,7 +6213,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -6240,11 +6237,11 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6255,26 +6252,25 @@ ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -6290,7 +6286,7 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -6305,12 +6301,12 @@ ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6326,9 +6322,9 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6337,7 +6333,8 @@ ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[3,3,3,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -6345,8 +6342,7 @@ ; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4],ymm3[5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -6361,7 +6357,7 @@ ; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] @@ -6387,73 +6383,27 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm12, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm2, %xmm9 +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6474,8 +6424,9 @@ ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] @@ -6484,26 +6435,25 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] @@ -6534,12 +6484,12 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] @@ -6551,119 +6501,168 @@ ; AVX2-ONLY-NEXT: vpbroadcastd %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm12 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %xmm13 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,0,4,5,6,4] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa %xmm10, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6677,19 +6676,19 @@ ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] @@ -6698,200 +6697,203 @@ ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] ; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm11[7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[3,1,2,3,7,5,6,7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm10 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, (%rsp), %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm6 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = mem[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] +; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] +; AVX2-ONLY-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[3,3,3,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] -; AVX2-ONLY-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15] -; AVX2-ONLY-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-ONLY-NEXT: addq $2472, %rsp # imm = 0x9A8 +; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -6908,7 +6910,7 @@ ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] @@ -7302,7 +7304,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -7377,7 +7379,7 @@ ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload @@ -7705,7 +7707,7 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,7,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -7813,7 +7815,7 @@ ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -8048,7 +8050,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [1,5,1,1] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm11, %xmm8 @@ -8119,7 +8121,7 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm4 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3] @@ -8222,7 +8224,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [3,7,3,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8295,7 +8297,7 @@ ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -8478,7 +8480,7 @@ ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [1,5,1,1] ; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm15, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -8544,7 +8546,7 @@ ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload @@ -8649,7 +8651,7 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = [3,7,3,3] ; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm24, %xmm1 ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm1[0,1],mem[2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -53,25 +53,15 @@ ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i32_stride2_vf4: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; AVX1-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-NEXT: vmovaps %xmm0, (%rdx) -; AVX1-NEXT: retq -; -; AVX512-LABEL: load_i32_stride2_vf4: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],mem[1,3] -; AVX512-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512-NEXT: vmovaps %xmm1, (%rdx) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: load_i32_stride2_vf4: +; AVX: # %bb.0: +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX-NEXT: vmovaps %xmm2, (%rsi) +; AVX-NEXT: vmovaps %xmm0, (%rdx) +; AVX-NEXT: retq %wide.vec = load <8 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <4 x i32> @@ -87,16 +77,16 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf8: @@ -126,45 +116,53 @@ ; ; AVX512F-SLOW-LABEL: load_i32_stride2_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7] -; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX512F-SLOW-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512F-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX512F-SLOW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i32_stride2_vf8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512F-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512F-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14] +; AVX512F-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] +; AVX512F-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3 +; AVX512F-FAST-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: load_i32_stride2_vf8: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7] -; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX512BW-SLOW-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512BW-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX512BW-SLOW-NEXT: vmovaps %ymm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: load_i32_stride2_vf8: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] -; AVX512BW-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2 -; AVX512BW-FAST-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rdx) +; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14] +; AVX512BW-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15] +; AVX512BW-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3 +; AVX512BW-FAST-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-FAST-NEXT: vmovaps %ymm3, (%rdx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 @@ -182,48 +180,48 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm4 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] ; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm1[0,2] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm6[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: movaps %xmm9, 32(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps %xmm11, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: movaps %xmm9, 48(%rsi) +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,2],ymm4[0,2],ymm1[4,6],ymm4[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm4[1,3],ymm1[5,7],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -233,18 +231,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -271,30 +269,30 @@ define void @load_i32_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i32_stride2_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdi), %xmm10 -; SSE-NEXT: movaps 224(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm14 -; SSE-NEXT: movaps 160(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps 112(%rdi), %xmm11 +; SSE-NEXT: movaps 96(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm12 +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps 176(%rdi), %xmm13 +; SSE-NEXT: movaps 160(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm14 +; SSE-NEXT: movaps 128(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm12[1,3] ; SSE-NEXT: movaps %xmm2, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm11[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm11[1,3] -; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movaps %xmm8, %xmm11 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm14[1,3] ; SSE-NEXT: movaps %xmm6, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm13[1,3] @@ -305,30 +303,30 @@ ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm9[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm9[1,3] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm8[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm7[1,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm8[1,3] -; SSE-NEXT: movaps %xmm10, 96(%rsi) -; SSE-NEXT: movaps %xmm12, 32(%rsi) -; SSE-NEXT: movaps %xmm13, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 48(%rsi) -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps %xmm11, 80(%rsi) -; SSE-NEXT: movaps %xmm9, 16(%rsi) -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm5, 112(%rdx) -; SSE-NEXT: movaps %xmm6, 64(%rdx) -; SSE-NEXT: movaps %xmm7, 80(%rdx) -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3] +; SSE-NEXT: movaps %xmm10, 112(%rsi) +; SSE-NEXT: movaps %xmm12, 48(%rsi) +; SSE-NEXT: movaps %xmm13, 96(%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps %xmm14, 80(%rsi) +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm11, 64(%rsi) +; SSE-NEXT: movaps %xmm9, (%rsi) +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm5, 96(%rdx) +; SSE-NEXT: movaps %xmm6, 80(%rdx) +; SSE-NEXT: movaps %xmm8, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf32: @@ -337,30 +335,30 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,2],ymm4[0,2],ymm3[4,6],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,2],ymm6[0,2],ymm1[4,6],ymm6[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,2],ymm8[0,2],ymm0[4,6],ymm8[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm10[0,2],ymm2[4,6],ymm10[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm6[1,3],ymm1[5,7],ymm6[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm4[1,3],ymm3[5,7],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm8[1,3],ymm0[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm10[1,3],ymm2[5,7],ymm10[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm6[0,2],ymm2[4,6],ymm6[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,2],ymm8[0,2],ymm1[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm3[0,2],ymm10[0,2],ymm3[4,6],ymm10[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm4[1,3],ymm0[5,7],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm8[1,3],ymm1[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm6[1,3],ymm2[5,7],ymm6[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm10[1,3],ymm3[5,7],ymm10[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -370,34 +368,34 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,2],ymm4[0,2],ymm5[4,6],ymm4[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,3],ymm4[1,3],ymm5[5,7],ymm4[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -432,21 +430,21 @@ ; SSE-LABEL: load_i32_stride2_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm7 +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps 304(%rdi), %xmm9 +; SSE-NEXT: movaps 176(%rdi), %xmm14 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 256(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm2, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm13[0,2] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -482,236 +480,236 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm12[1,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm9[1,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,3],xmm0[1,3] -; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,3],xmm0[1,3] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm0[1,3] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 +; SSE-NEXT: movaps 432(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm0[1,3] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 496(%rdi), %xmm3 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm11 +; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm2[1,3] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm8, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[1,3] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps %xmm12, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps %xmm10, 192(%rsi) +; SSE-NEXT: movaps %xmm5, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps %xmm14, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps %xmm1, 240(%rdx) +; SSE-NEXT: movaps %xmm4, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm11, (%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm3, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[0,2],ymm12[0,2],ymm5[4,6],ymm12[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[0,2],ymm15[0,2],ymm6[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm15[1,3],ymm6[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm12[0,2],ymm7[4,6],ymm12[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2],ymm14[0,2],ymm5[4,6],ymm14[4,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3],ymm12[1,3],ymm7[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3],ymm14[1,3],ymm5[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,2],ymm14[0,2],ymm2[4,6],ymm14[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm14[1,3],ymm2[5,7],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,2],ymm12[0,2],ymm4[4,6],ymm12[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm12[1,3],ymm4[5,7],ymm12[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,2],ymm11[0,2],ymm6[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3],ymm11[1,3],ymm6[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm11[0,2],ymm1[4,6],ymm11[4,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm11[1,3],ymm1[5,7],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,2],ymm15[0,2],ymm7[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3],ymm15[1,3],ymm7[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3],ymm12[1,3],ymm5[5,7],ymm12[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,2],ymm15[0,2],ymm2[4,6],ymm15[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm15[1,3],ymm2[5,7],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,2],ymm14[0,2],ymm4[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,3],ymm14[1,3],ymm4[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm14[0,2],ymm0[4,6],ymm14[4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm14[1,3],ymm0[5,7],ymm14[5,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride2_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm14[0,2],ymm15[4,6],ymm14[4,6] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3],ymm14[1,3],ymm15[5,7],ymm14[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,2],ymm8[0,2],ymm9[4,6],ymm8[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,3],ymm8[1,3],ymm9[5,7],ymm8[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,2],ymm12[0,2],ymm13[4,6],ymm12[4,6] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,3],ymm12[1,3],ymm13[5,7],ymm12[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,2],ymm6[0,2],ymm7[4,6],ymm6[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,3],ymm6[1,3],ymm7[5,7],ymm6[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,2],ymm10[0,2],ymm11[4,6],ymm10[4,6] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,3],ymm10[1,3],ymm11[5,7],ymm10[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,2],ymm4[0,2],ymm7[4,6],ymm4[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,3],ymm4[1,3],ymm7[5,7],ymm4[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,2],ymm5[0,2],ymm8[4,6],ymm5[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,3],ymm5[1,3],ymm8[5,7],ymm5[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm6[0,2],ymm9[4,6],ymm6[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,3],ymm6[1,3],ymm9[5,7],ymm6[5,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[0,2],ymm2[0,2],ymm1[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],ymm2[1,3],ymm1[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2],ymm2[0,2],ymm3[4,6],ymm2[4,6] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,3],ymm2[1,3],ymm3[5,7],ymm2[5,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[0,2],ymm3[0,2],ymm5[4,6],ymm3[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,3],ymm3[1,3],ymm5[5,7],ymm3[5,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2],ymm2[0,2],ymm4[4,6],ymm2[4,6] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,3],ymm2[1,3],ymm4[5,7],ymm2[5,7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -730,22 +728,22 @@ ; AVX512-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2d %zmm3, %zmm8, %zmm11 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm11 +; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] -; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm7 -; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm0 +; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i32>, ptr %in.vec, align 64 @@ -756,6 +754,7 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -128,22 +128,22 @@ ; SSE-LABEL: load_i32_stride3_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[2,0] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, (%rsi) ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf4: @@ -154,8 +154,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] @@ -213,40 +212,40 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, (%rcx) +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movaps 80(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[2,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm4[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[2,0] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0] +; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm8, (%rsi) +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm9, 16(%rcx) +; SSE-NEXT: movaps %xmm7, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf8: @@ -255,27 +254,27 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm5[0,2],ymm3[4,7],ymm5[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[1,0],ymm5[2,0],ymm0[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0],ymm6[2,0],ymm5[4,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm4[0,3],ymm7[5,6],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,3],ymm5[4,5],ymm0[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -399,98 +398,77 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps 96(%rdi), %xmm6 -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 16(%rdi), %xmm8 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm2 +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm8 +; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps 32(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm15 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm9[2,0] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm12[2,0] +; SSE-NEXT: movaps %xmm15, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm11[2,0] +; SSE-NEXT: movaps %xmm10, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm8[2,0] +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm14[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm3[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm13[2,0] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm5, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm0, 32(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rdx) -; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm10[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[2,0] +; SSE-NEXT: movaps %xmm14, 48(%rsi) +; SSE-NEXT: movaps %xmm11, 32(%rsi) +; SSE-NEXT: movaps %xmm9, 16(%rsi) +; SSE-NEXT: movaps %xmm6, (%rsi) +; SSE-NEXT: movaps %xmm12, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm3, 16(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf16: @@ -499,52 +477,52 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[0,3],ymm7[4,5],ymm3[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,1],ymm5[1,3],ymm7[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm8[0,2],ymm4[4,7],ymm8[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,0],ymm8[2,0],ymm3[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[2,1],ymm1[1,3],ymm10[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,3],ymm11[0,2],ymm9[4,7],ymm11[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[2,0],ymm0[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[2,0],ymm8[3,0],ymm3[6,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,0],ymm12[2,0],ymm8[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,2],ymm7[0,3],ymm13[5,6],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm11[3,0],ymm0[6,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm11[0,0],ymm12[2,0],ymm11[4,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[1,2],ymm10[0,3],ymm13[5,6],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,0],ymm6[2,0],ymm12[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[0,3],ymm6[6,4],ymm5[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1],ymm3[0,3],ymm8[4,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm2[2,0],ymm13[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[2,0],ymm5[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm0[0,3],ymm9[4,5],ymm0[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm0[0,3],ymm11[4,5],ymm0[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -731,471 +709,426 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $360, %rsp # imm = 0x168 -; SSE-NEXT: movaps 192(%rdi), %xmm3 -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 208(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 272(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $184, %rsp +; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps (%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps 64(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: movaps 272(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 80(%rdi), %xmm5 +; SSE-NEXT: movaps 192(%rdi), %xmm6 +; SSE-NEXT: movaps 208(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm10 -; SSE-NEXT: movaps 160(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rdi), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm12[2,0] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[2,0] +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: movaps 304(%rdi), %xmm8 +; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[2,0] +; SSE-NEXT: movaps 288(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 176(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[2,0] +; SSE-NEXT: movaps 144(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 336(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm2 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 288(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm6[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm0[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm13, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm14[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm14[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm7[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 16(%rsi) -; SSE-NEXT: movaps %xmm2, 96(%rdx) -; SSE-NEXT: movaps %xmm10, 32(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm11, 64(%rdx) +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm8[2,0] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm10[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm11[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[2,0] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps %xmm13, 64(%rcx) -; SSE-NEXT: movaps %xmm7, 80(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm5, 48(%rcx) -; SSE-NEXT: movaps %xmm6, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: addq $360, %rsp # imm = 0x168 +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps %xmm14, 48(%rdx) +; SSE-NEXT: movaps %xmm15, 96(%rdx) +; SSE-NEXT: movaps %xmm13, 32(%rdx) +; SSE-NEXT: movaps %xmm4, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 16(%rdx) +; SSE-NEXT: movaps %xmm6, 64(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm8, 112(%rcx) +; SSE-NEXT: movaps %xmm3, 96(%rcx) +; SSE-NEXT: movaps %xmm12, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm9, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: addq $184, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: subq $232, %rsp +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm13[2,0],ymm7[5,4],ymm13[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm8[1,3],ymm1[6,5],ymm8[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm4[1,3],ymm1[6,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm15[1],ymm11[2,3],ymm15[4],ymm11[5,6],ymm15[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0],ymm11[1],ymm2[2,3],ymm11[4],ymm2[5,6],ymm11[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,1],ymm11[1,3],ymm6[6,5],ymm11[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,3],ymm15[0,2],ymm14[4,7],ymm15[4,6] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[2,0],ymm10[3,0],ymm8[6,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,0],ymm0[2,0],ymm10[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4],ymm9[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[2,0],ymm13[3,0],ymm7[6,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0],ymm10[2,0],ymm13[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps $219, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm1[3,0],ymm10[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[2,0],ymm3[3,0],ymm5[6,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[0,0],ymm10[2,0],ymm3[4,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm0[2,0],ymm12[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm9[0,3],ymm7[6,4],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps $196, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,3],ymm1[4,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm0[0,3],ymm2[6,4],ymm0[4,7] -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm6[0,3],ymm0[5,6],ymm6[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[2,0],ymm15[3,0],ymm14[6,4],ymm15[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0],ymm6[2,0],ymm15[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm6[2,0],ymm10[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm7[0,3],ymm13[4,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm5[0,3],ymm3[4,5],ymm5[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, (%rsp), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm8[0,3],ymm3[6,4],ymm8[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2],ymm11[3,4],ymm2[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm2[2,0],ymm4[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[0,3],ymm2[6,4],ymm11[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1],ymm14[0,3],ymm15[4,5],ymm14[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $104, %rsp -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,5,2,5,2,5,2,5] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6],ymm14[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm13[2],ymm4[3,4],ymm13[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-SLOW-NEXT: addq $104, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1203,100 +1136,99 @@ ; AVX2-FAST-LABEL: load_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $104, %rsp -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm13, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1],ymm4[2],ymm8[3,4],ymm4[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm11 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm15[2],ymm5[3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm6 ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm14[2],ymm7[3,4],ymm14[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rsi) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: addq $104, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1304,103 +1236,102 @@ ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm7 = [2,5,2,5,2,5,2,5] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm1, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1],ymm13[2],ymm4[3,4],ymm13[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm1, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm11, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2],ymm13[3,4],ymm4[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1454,810 +1385,744 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1064, %rsp # imm = 0x428 -; SSE-NEXT: movaps 624(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 656(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 432(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm6 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 +; SSE-NEXT: movaps 576(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm5 +; SSE-NEXT: movaps 592(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 608(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 400(%rdi), %xmm13 +; SSE-NEXT: movaps 416(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 32(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[2,0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[2,0] +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[2,0] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[2,0] +; SSE-NEXT: movaps 48(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps 272(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[2,0] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm0 +; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 432(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 656(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 384(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 624(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[2,0] +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 304(%rdi), %xmm7 +; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 576(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[2,0] +; SSE-NEXT: movaps 288(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 480(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 688(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 704(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 672(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[2,0] +; SSE-NEXT: movaps 144(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm4 +; SSE-NEXT: movaps 368(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[2,0] +; SSE-NEXT: movaps 336(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 544(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 560(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[2,0] +; SSE-NEXT: movaps 528(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 736(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 752(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm6[2,0] +; SSE-NEXT: movaps 720(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps 336(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm5[2,0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 560(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 528(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[2,0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm8[2,0] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm14[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[1,0] -; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[1,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm13 -; SSE-NEXT: movaps 304(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm13[1,0] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 512(%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[1,0] -; SSE-NEXT: movaps 480(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 688(%rdi), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[1,0] -; SSE-NEXT: movaps 672(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm15[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm9[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm14[0,0] -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm2[0,0] -; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[3,1],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm2[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; SSE-NEXT: movaps %xmm2, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm1, 240(%rdx) +; SSE-NEXT: movaps %xmm3, 224(%rdx) +; SSE-NEXT: movaps %xmm4, 208(%rdx) +; SSE-NEXT: movaps %xmm7, 192(%rdx) +; SSE-NEXT: movaps %xmm9, 176(%rdx) +; SSE-NEXT: movaps %xmm11, 160(%rdx) +; SSE-NEXT: movaps %xmm13, 144(%rdx) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm5, 240(%rcx) +; SSE-NEXT: movaps %xmm8, 224(%rcx) +; SSE-NEXT: movaps %xmm10, 208(%rcx) +; SSE-NEXT: movaps %xmm12, 192(%rcx) +; SSE-NEXT: movaps %xmm14, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm0, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm12, 224(%rdx) -; SSE-NEXT: movaps %xmm14, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 192(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 128(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 144(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 240(%rcx) -; SSE-NEXT: movaps %xmm1, 224(%rcx) -; SSE-NEXT: movaps %xmm2, 208(%rcx) -; SSE-NEXT: movaps %xmm3, 192(%rcx) -; SSE-NEXT: movaps %xmm5, 176(%rcx) -; SSE-NEXT: movaps %xmm6, 160(%rcx) -; SSE-NEXT: movaps %xmm7, 144(%rcx) -; SSE-NEXT: movaps %xmm8, 128(%rcx) -; SSE-NEXT: movaps %xmm9, 112(%rcx) -; SSE-NEXT: movaps %xmm15, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 80(%rcx) -; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: addq $1064, %rsp # imm = 0x428 +; SSE-NEXT: movaps %xmm6, (%rcx) +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1416, %rsp # imm = 0x588 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,1],ymm10[1,3],ymm8[6,5],ymm10[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm1[2,0],ymm7[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[2,0],ymm1[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[2,0],ymm1[5,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm11[1,3],ymm0[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,1],ymm4[1,3],ymm0[6,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm2[0,2],ymm1[4,7],ymm2[4,6] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm3[0,2],ymm2[4,7],ymm3[4,6] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm13[2,0],ymm15[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm12[0,2],ymm3[4,7],ymm12[4,6] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm14[0,2],ymm0[4,7],ymm14[4,6] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm12[2,0],ymm1[5,4],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[3,0],ymm14[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[3,0],ymm0[6,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,0],ymm0[2,0],ymm5[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[3,0],ymm14[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[3,0],ymm14[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[3,0],ymm0[6,4],ymm13[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0],ymm0[2,0],ymm13[4,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[3,0],ymm14[6,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[2,0],ymm0[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[2,0],ymm6[3,0],ymm8[6,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm14[2,0],ymm6[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,0],ymm13[3,0],ymm15[6,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,0],ymm14[2,0],ymm13[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm1[3,0],ymm14[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX1-ONLY-NEXT: vshufps $201, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1,2],mem[0,3],ymm0[5,6],mem[4,7] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm12[3,0],ymm4[6,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0],ymm1[2,0],ymm12[4,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm7[3,0],ymm6[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0],ymm2[2,0],ymm7[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm12[2,0],ymm7[3,0],ymm12[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,0],ymm5[2,0],ymm7[4,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,0],ymm14[2,0],ymm3[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[0,3],ymm3[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,0],ymm0[2,0],ymm13[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[0,3],ymm0[6,4],ymm8[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1],mem[0,3],ymm8[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[0,3],ymm0[6,4],ymm9[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,0],ymm14[2,0],ymm12[4,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[0,3],ymm0[6,4],ymm10[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm1[2,0],ymm9[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[0,3],ymm1[6,4],ymm4[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[0,3],ymm2[6,4],ymm10[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[0,3],ymm0[6,4],ymm4[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm8[0,3],ymm6[4,5],ymm8[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[0,3],ymm4[6,4],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm7[0,3],ymm5[6,4],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,1],ymm15[0,3],ymm13[4,5],ymm15[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm7[0,3],ymm5[6,4],ymm7[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm5[2],ymm11[3,4],ymm5[5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm11[0,3],ymm5[6,4],ymm11[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1],ymm12[0,3],ymm6[4,5],ymm12[4,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[0,3],ymm6[6,4],ymm2[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1416, %rsp # imm = 0x588 +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1],mem[0,3],ymm6[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm8[0,3],ymm6[6,4],ymm8[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1],mem[0,3],ymm7[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[2,0],ymm3[0,3],ymm7[6,4],ymm3[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1],mem[0,3],ymm7[4,5],mem[4,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-SLOW-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2267,184 +2132,180 @@ ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovaps %ymm3, %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm14[2],ymm8[3,4],ymm14[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rsi) @@ -2453,13 +2314,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2468,41 +2329,50 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-SLOW-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-SLOW-NEXT: addq $1128, %rsp # imm = 0x468 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 @@ -2520,60 +2390,60 @@ ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm15 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm13 ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] @@ -2603,7 +2473,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm14 @@ -2632,14 +2502,14 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -2675,14 +2545,6 @@ ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rsi) @@ -2691,13 +2553,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2706,40 +2568,50 @@ ; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm14, (%rcx) ; AVX2-FAST-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1096, %rsp # imm = 0x448 -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2749,184 +2621,180 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm13, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm13, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm13, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm13[2],mem[3,4],ymm13[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm15, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm13[0,1],mem[2],ymm13[3,4],mem[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps $196, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm14[2],ymm8[3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rsi) @@ -2935,13 +2803,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2950,15 +2818,24 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: addq $1096, %rsp # imm = 0x448 +; AVX2-FAST-PERLANE-NEXT: addq $1128, %rsp # imm = 0x468 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2990,13 +2867,13 @@ ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] -; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 -; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 +; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm19 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 @@ -3006,10 +2883,10 @@ ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 -; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 -; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 ; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 ; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 +; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 ; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 ; AVX512-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) @@ -3017,13 +2894,13 @@ ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -175,59 +175,40 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: load_i32_stride4_vf4: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%r8) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512-LABEL: load_i32_stride4_vf4: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12] -; AVX512-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,9,13] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,6,10,14] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [3,7,11,15] -; AVX512-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512-NEXT: vmovdqa %xmm3, (%rdx) -; AVX512-NEXT: vmovdqa %xmm4, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX2-LABEL: load_i32_stride4_vf4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] +; AVX2-NEXT: # xmm0 = mem[0,0] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps (%rdi), %xmm2 +; AVX2-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-NEXT: # xmm7 = mem[0,0] +; AVX2-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm7 +; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6] +; AVX2-NEXT: # xmm7 = mem[0,0] +; AVX2-NEXT: vpermps %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpermps %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-NEXT: vmovaps %xmm6, (%rdx) +; AVX2-NEXT: vmovaps %xmm1, (%rcx) +; AVX2-NEXT: vmovaps %xmm2, (%r8) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <4 x i32> @@ -423,105 +404,105 @@ ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movaps 208(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm1 -; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps 240(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 144(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps 176(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 160(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm10 -; SSE-NEXT: movaps 112(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm9 +; SSE-NEXT: movaps 112(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm14, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movaps 192(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps 48(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 16(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movaps %xmm13, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movaps %xmm10, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm8[1] -; SSE-NEXT: movaps %xmm14, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps %xmm12, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps %xmm6, 32(%rdx) -; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps %xmm13, (%rdx) +; SSE-NEXT: movaps %xmm8, 48(%rcx) +; SSE-NEXT: movaps %xmm7, 32(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm12, 48(%r8) -; SSE-NEXT: movaps %xmm14, 32(%r8) -; SSE-NEXT: movaps %xmm10, 16(%r8) -; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm9, 16(%r8) +; SSE-NEXT: movaps %xmm10, (%r8) ; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; @@ -823,139 +804,154 @@ ; SSE-LABEL: load_i32_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 -; SSE-NEXT: movaps 272(%rdi), %xmm7 +; SSE-NEXT: movaps 336(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 368(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 304(%rdi), %xmm8 +; SSE-NEXT: movaps 352(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 +; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm10 +; SSE-NEXT: movaps 112(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 288(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 160(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 432(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: movaps 400(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 384(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%rdi), %xmm14 +; SSE-NEXT: movaps 224(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 480(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 448(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps (%rdi), %xmm10 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps (%rdi), %xmm10 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movaps (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] @@ -963,116 +959,99 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm7, 96(%rdx) -; SSE-NEXT: movaps %xmm11, 32(%rdx) -; SSE-NEXT: movaps %xmm15, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps %xmm4, 112(%rcx) +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps (%rsp), %xmm9 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm15, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm6, (%rdx) +; SSE-NEXT: movaps %xmm13, 112(%rcx) +; SSE-NEXT: movaps %xmm11, 96(%rcx) +; SSE-NEXT: movaps %xmm9, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm14, (%rcx) -; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm5, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm9, 96(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps %xmm7, 112(%r8) +; SSE-NEXT: movaps %xmm14, 96(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps %xmm12, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps %xmm10, (%r8) @@ -1081,31 +1060,29 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm11[0],ymm2[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm13[0],ymm5[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm11 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1118,44 +1095,43 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1163,74 +1139,80 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,0],ymm10[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm11[1,0],ymm5[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[1,0],ymm8[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[1],xmm11[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm13[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm14[1,0],ymm12[5,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[1,0],ymm10[5,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm14[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm3[1,0],ymm1[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm15[2,3],ymm1[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[1,0],ymm1[5,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,0],ymm8[2,3],ymm1[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero @@ -1238,58 +1220,59 @@ ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[6],ymm5[6],ymm10[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm9[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm9[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,0],ymm1[4,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm4[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm12[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm7[2],xmm12[2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = zero,zero,xmm8[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -1298,50 +1281,51 @@ ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm8[3,0],ymm5[7,4],ymm8[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[2,3],ymm4[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,0],xmm11[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm4[2,3],ymm5[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,0],mem[3,0],ymm13[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,0],ymm1[2,3],ymm8[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm15[3,0],xmm9[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,0],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm6[3,0],ymm7[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm5[2,3],ymm7[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) @@ -1350,47 +1334,43 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] @@ -1399,38 +1379,41 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1444,169 +1427,169 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm4 +; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm14, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm13 -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm1, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1686,37 +1669,40 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i32_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1224, %rsp # imm = 0x4C8 -; SSE-NEXT: movaps 144(%rdi), %xmm4 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: subq $1240, %rsp # imm = 0x4D8 +; SSE-NEXT: movaps 208(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps 240(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm10 +; SSE-NEXT: movaps 128(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -1724,28 +1710,11 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 336(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 192(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1753,28 +1722,27 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 304(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps 272(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps 272(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 256(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 496(%rdi), %xmm1 +; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm2 +; SSE-NEXT: movaps 336(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 320(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1790,21 +1758,21 @@ ; SSE-NEXT: movaps 400(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 624(%rdi), %xmm1 +; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm0 +; SSE-NEXT: movaps 480(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 592(%rdi), %xmm2 +; SSE-NEXT: movaps 464(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1827,15 +1795,15 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 752(%rdi), %xmm1 +; SSE-NEXT: movaps 624(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm0 +; SSE-NEXT: movaps 608(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 720(%rdi), %xmm2 +; SSE-NEXT: movaps 592(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -1857,14 +1825,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 880(%rdi), %xmm1 +; SSE-NEXT: movaps 752(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 736(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 848(%rdi), %xmm2 +; SSE-NEXT: movaps 720(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps 704(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1874,8 +1842,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 816(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 800(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 800(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 784(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1887,332 +1855,350 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1008(%rdi), %xmm1 +; SSE-NEXT: movaps 880(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm15 +; SSE-NEXT: movaps 864(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 976(%rdi), %xmm1 +; SSE-NEXT: movaps 848(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 944(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 928(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 912(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm14 +; SSE-NEXT: movaps 896(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] -; SSE-NEXT: movaps 944(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 928(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 912(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm5 +; SSE-NEXT: movaps 1008(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 976(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE-NEXT: movaps 32(%rdi), %xmm7 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movaps (%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm4[0] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: movaps %xmm8, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm0, %xmm9 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm6[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 224(%rdx) -; SSE-NEXT: movaps %xmm14, 240(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm12 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 240(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, (%rsi) +; SSE-NEXT: movaps %xmm5, 240(%rdx) +; SSE-NEXT: movaps %xmm14, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 160(%rdx) +; SSE-NEXT: movaps %xmm5, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 128(%rdx) +; SSE-NEXT: movaps %xmm5, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 96(%rdx) +; SSE-NEXT: movaps %xmm5, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm5, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps %xmm10, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 16(%rdx) -; SSE-NEXT: movaps %xmm3, 240(%rcx) -; SSE-NEXT: movaps %xmm8, 224(%rcx) -; SSE-NEXT: movaps %xmm13, 208(%rcx) -; SSE-NEXT: movaps %xmm0, 192(%rcx) -; SSE-NEXT: movaps %xmm1, 176(%rcx) -; SSE-NEXT: movaps %xmm4, 160(%rcx) -; SSE-NEXT: movaps %xmm7, 144(%rcx) -; SSE-NEXT: movaps %xmm11, 128(%rcx) -; SSE-NEXT: movaps %xmm12, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm10, (%rdx) +; SSE-NEXT: movaps %xmm11, 240(%rcx) +; SSE-NEXT: movaps %xmm15, 224(%rcx) +; SSE-NEXT: movaps %xmm1, 208(%rcx) +; SSE-NEXT: movaps %xmm2, 192(%rcx) +; SSE-NEXT: movaps %xmm3, 176(%rcx) +; SSE-NEXT: movaps %xmm6, 160(%rcx) +; SSE-NEXT: movaps %xmm8, 144(%rcx) +; SSE-NEXT: movaps %xmm9, 128(%rcx) +; SSE-NEXT: movaps %xmm13, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm12, 240(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%r8) -; SSE-NEXT: movaps %xmm9, 224(%r8) +; SSE-NEXT: movaps %xmm0, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2221,13 +2207,13 @@ ; SSE-NEXT: movaps %xmm0, 176(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) @@ -2239,34 +2225,39 @@ ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm15, (%r8) -; SSE-NEXT: addq $1224, %rsp # imm = 0x4C8 +; SSE-NEXT: movaps %xmm4, (%r8) +; SSE-NEXT: addq $1240, %rsp # imm = 0x4D8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride4_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2200, %rsp # imm = 0x898 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[4],ymm9[4],ymm5[5],ymm9[5] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[4],ymm2[4],ymm7[5],ymm2[5] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, %xmm14 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2279,157 +2270,156 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,0],ymm13[4,5],ymm11[6,4] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] @@ -2437,51 +2427,45 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[4],ymm8[4],ymm14[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm7[1,0],ymm9[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm7[1,0],ymm10[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[1],xmm5[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm12[1],xmm14[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm5[1,0],ymm10[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm4[1,0],ymm7[5,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[1],xmm15[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[1],xmm9[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm14[0],ymm2[1],ymm14[1],ymm2[4],ymm14[4],ymm2[5],ymm14[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm8[0],mem[0],xmm8[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2524,9 +2508,8 @@ ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] @@ -2536,9 +2519,9 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -2559,60 +2542,61 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm9[2],ymm14[3],ymm9[3],ymm14[6],ymm9[6],ymm14[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm3[2],xmm11[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[6],ymm10[6],ymm5[7],ymm10[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm6[2],xmm12[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[6],ymm9[6],ymm5[7],ymm9[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm3[1],ymm11[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm5[2],xmm10[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm11[2],ymm7[3],ymm11[3],ymm7[6],ymm11[6],ymm7[7],ymm11[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm2[1],ymm14[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm8[2],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm4[2],xmm5[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2626,9 +2610,9 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,zero,xmm12[2],xmm14[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2638,31 +2622,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm0[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm15[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -2674,686 +2657,680 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = zero,zero,xmm13[2],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm6[3,0],ymm9[7,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[3,0],xmm11[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm14[3,0],ymm1[7,4],ymm14[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm0[2,3],ymm3[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,0],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[3,0],ymm5[3,0],ymm9[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,0],ymm1[2,3],ymm13[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,0],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm7[3,0],ymm11[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[2,0],ymm10[2,3],ymm11[6,4],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[2,0],ymm0[2,3],ymm5[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm5[2,3],ymm7[6,4],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,0],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm6[2,3],ymm7[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[2,0],ymm6[2,3],ymm9[6,4],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[2,3],ymm9[6,4],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,0],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,0],ymm4[2,3],ymm8[6,4],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm8[2,3],ymm9[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm15[2],mem[2],xmm15[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,0],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm2[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,0],ymm2[2,3],ymm7[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm3[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,0],ymm3[2,3],ymm5[6,4],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,0],mem[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[3,0],mem[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: addq $2200, %rsp # imm = 0x898 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1960, %rsp # imm = 0x7A8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $1976, %rsp # imm = 0x7B8 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm12 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm15, %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,5,1,5,1,5,1,5] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm15 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6] -; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,6,2,6,2,6,2,6] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,6,2,6] ; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps %ymm8, %ymm6 ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm7 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3,7,3,7,3,7,3,7] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: addq $1976, %rsp # imm = 0x7B8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -3365,72 +3342,72 @@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2d %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2d %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-NEXT: vpermt2d %zmm18, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2d %zmm12, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm16 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm16[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-NEXT: vpermt2d %zmm7, %zmm19, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm20 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm19 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2d %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2d %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2d %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2d %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,17,21,25,29,1,5,9,13,17,21,25,29] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512-NEXT: vpermt2d %zmm18, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2d %zmm14, %zmm20, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2d %zmm12, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2d %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2d %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2d %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2d %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,18,22,26,30,2,6,10,14,18,22,26,30] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2d %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2d %zmm18, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2d %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2d %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-NEXT: vpermt2d %zmm12, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512-NEXT: vpermt2d %zmm9, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2d %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2d %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -3440,34 +3417,34 @@ ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,19,23,27,31,3,7,11,15,19,23,27,31] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2d %zmm17, %zmm28, %zmm16 +; AVX512-NEXT: vpermt2d %zmm18, %zmm28, %zmm17 ; AVX512-NEXT: vpermt2d %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2d %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm9, %zmm28, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm7, %zmm28, %zmm6 +; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 @@ -3484,7 +3461,6 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -68,13 +68,13 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX2-ONLY-NEXT: vpbroadcastd 16(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastss 16(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovq %xmm3, (%rsi) ; AVX2-ONLY-NEXT: vmovq %xmm4, (%rdx) ; AVX2-ONLY-NEXT: vpextrq $1, %xmm1, (%rcx) ; AVX2-ONLY-NEXT: vmovq %xmm0, (%r8) -; AVX2-ONLY-NEXT: vmovq %xmm2, (%r9) +; AVX2-ONLY-NEXT: vmovlps %xmm2, (%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -90,7 +90,9 @@ ; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512F-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512F-SLOW-NEXT: vmovd %xmm2, %eax +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) @@ -109,16 +111,17 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] ; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 +; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpextrd $3, %xmm1, %eax +; AVX512F-FAST-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 ; AVX512F-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512F-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm0, (%r8) -; AVX512F-FAST-NEXT: vmovq %xmm1, (%r9) +; AVX512F-FAST-NEXT: vmovq %xmm2, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -134,7 +137,9 @@ ; AVX512BW-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5 ; AVX512BW-SLOW-NEXT: vpextrd $3, %xmm1, %eax ; AVX512BW-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 -; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX512BW-SLOW-NEXT: vmovd %xmm2, %eax +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rsi) @@ -153,16 +158,17 @@ ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6] ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7] -; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5 +; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm5 +; AVX512BW-FAST-NEXT: vpextrd $3, %xmm1, %eax +; AVX512BW-FAST-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1 ; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX512BW-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX512BW-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm5 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3] ; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FAST-NEXT: vmovq %xmm5, (%rcx) +; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512BW-FAST-NEXT: vmovq %xmm0, (%r8) -; AVX512BW-FAST-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-FAST-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %wide.vec = load <10 x i32>, ptr %in.vec, align 64 @@ -238,7 +244,7 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] @@ -425,56 +431,55 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[2,0],ymm8[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,1],ymm8[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm11 = xmm11[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1],ymm1[1,3],ymm8[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm6[3,0],ymm8[6,4],ymm6[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm1[2,0],ymm7[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm1[2,1],ymm7[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm8[2],xmm10[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm10 = xmm10[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,0],ymm10[0,0],ymm0[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm1[2,2],ymm11[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm10[1,0],ymm0[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -589,26 +594,25 @@ ; SSE-LABEL: load_i32_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm4 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa 256(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 +; SSE-NEXT: movdqa 192(%rdi), %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm9 +; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm12 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm8 -; SSE-NEXT: movdqa 160(%rdi), %xmm10 -; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm13 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -618,14 +622,14 @@ ; SSE-NEXT: movdqa %xmm11, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa %xmm4, %xmm2 @@ -634,81 +638,77 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa 240(%rdi), %xmm10 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 288(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm14 +; SSE-NEXT: movdqa 224(%rdi), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -718,15 +718,15 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -734,34 +734,34 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: pshufd $255, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -779,89 +779,82 @@ ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movaps %xmm10, 16(%rsi) +; SSE-NEXT: movaps %xmm10, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 48(%rsi) +; SSE-NEXT: movaps %xmm11, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, (%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm11, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm13, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm3, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movapd %xmm13, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movapd %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm2, 48(%r8) -; SSE-NEXT: movapd %xmm6, (%r8) -; SSE-NEXT: movapd %xmm7, 32(%r8) -; SSE-NEXT: movapd %xmm14, 16(%r9) -; SSE-NEXT: movapd %xmm15, 48(%r9) -; SSE-NEXT: movapd %xmm12, (%r9) -; SSE-NEXT: movapd %xmm1, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movapd %xmm0, 48(%r8) +; SSE-NEXT: movapd %xmm2, 32(%r8) +; SSE-NEXT: movapd %xmm5, 16(%r8) +; SSE-NEXT: movapd %xmm7, (%r8) +; SSE-NEXT: movapd %xmm14, 48(%r9) +; SSE-NEXT: movapd %xmm15, 32(%r9) +; SSE-NEXT: movapd %xmm12, 16(%r9) +; SSE-NEXT: movapd %xmm1, (%r9) ; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $104, %rsp ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7] @@ -870,83 +863,84 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1],ymm5[1,3],ymm3[6,5],ymm5[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,0],ymm1[3,0],ymm3[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[3,0],ymm1[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm5[2,0],ymm14[7,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm5[2,0],ymm12[7,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,1],ymm1[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,1],ymm0[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm7[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm11[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm5[2,2],ymm14[6,4],ymm5[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm2[2,2],ymm11[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm9[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] @@ -958,16 +952,16 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1226,395 +1220,391 @@ ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm4 +; SSE-NEXT: movdqa 320(%rdi), %xmm11 +; SSE-NEXT: movdqa 336(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm13 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 32(%rdi), %xmm14 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm11 -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm7 -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 192(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm0 +; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 512(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm0 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa 608(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm13[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] +; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -1632,76 +1622,76 @@ ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rdx) +; SSE-NEXT: movaps %xmm12, 80(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, 16(%rdx) +; SSE-NEXT: movaps %xmm12, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 112(%rcx) +; SSE-NEXT: movaps %xmm12, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 80(%rcx) +; SSE-NEXT: movaps %xmm12, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movaps %xmm12, 16(%rcx) ; SSE-NEXT: movapd %xmm6, 112(%r8) ; SSE-NEXT: movapd %xmm8, 96(%r8) ; SSE-NEXT: movapd %xmm10, 80(%r8) ; SSE-NEXT: movapd %xmm11, 64(%r8) -; SSE-NEXT: movapd %xmm15, 48(%r8) -; SSE-NEXT: movapd %xmm13, 32(%r8) +; SSE-NEXT: movapd %xmm14, 48(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -1719,627 +1709,614 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: subq $920, %rsp # imm = 0x398 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm15[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0],ymm10[2,0],ymm15[7,4],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,1],ymm0[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm10[0,0],ymm13[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm5[0,0],ymm4[5,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm6[2,0],ymm14[7,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,1],ymm0[6,4],ymm6[6,5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,1],ymm0[6,4],ymm13[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm3[0,0],ymm14[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm9[0,0],ymm11[5,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[2,0],ymm0[7,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,1],ymm0[6,4],ymm8[6,5] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm11[1,0],ymm6[0,0],ymm11[5,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm12[2,0],ymm0[7,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm12[2,1],ymm0[6,4],ymm12[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm4 = xmm4[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm5[0,0],ymm8[5,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm10[1,0],ymm13[6,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,0],ymm13[0,0],ymm2[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm9[3,0],ymm2[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm3[1,0],ymm14[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[2,1],ymm1[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm6 = xmm6[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm1[0,0],ymm6[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm4[3,0],ymm3[4,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm10[3,0],ymm3[4,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm5[1,0],ymm4[6,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm12[3,0],ymm4[4,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm12[2,2],ymm15[6,4],ymm12[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[0,0],ymm1[3,0],ymm10[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm5[1,0],ymm8[6,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm9[1,0],ymm11[6,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm8[3,0],ymm1[4,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm8[2,2],ymm14[6,4],ymm8[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,0],ymm13[1,0],ymm2[6,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm5[0,0],ymm7[3,0],ymm5[4,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm7[2,2],ymm12[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm6[2,0],ymm7[1,0],ymm6[6,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm1[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX1-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) +; AVX1-ONLY-NEXT: addq $920, %rsp # imm = 0x398 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,2,7,0,5,2,7,0] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm4[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm9[2,3],mem[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm12[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2,3],ymm7[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm5[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm6[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm13[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5,6],ymm7[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm4 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2],ymm15[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm5[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm11[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [1,6,1,6,1,6,1,6] -; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm8[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [1,6,1,6,1,6,1,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm11, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm15[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, (%rsp), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm11, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm6[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm11, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm13[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm9 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1],ymm12[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm5[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,1,6,u> +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm13[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm9, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm14[0,1,2,3],mem[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm9, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-ONLY-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2557,237 +2534,242 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1928, %rsp # imm = 0x788 -; SSE-NEXT: movdqa 768(%rdi), %xmm2 +; SSE-NEXT: subq $1960, %rsp # imm = 0x7A8 +; SSE-NEXT: movdqa 688(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 752(%rdi), %xmm4 +; SSE-NEXT: movdqa 672(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: movdqa 640(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm6 +; SSE-NEXT: movdqa 368(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa 352(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 +; SSE-NEXT: movdqa 960(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm0 +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1088(%rdi), %xmm2 +; SSE-NEXT: movdqa 1008(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: movdqa 992(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm0 +; SSE-NEXT: movdqa 400(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 +; SSE-NEXT: movdqa 448(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm1 +; SSE-NEXT: movdqa 720(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa 736(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 688(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm0 +; SSE-NEXT: movdqa 768(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm2 +; SSE-NEXT: movdqa 1088(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 992(%rdi), %xmm0 +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm6 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm15 -; SSE-NEXT: movdqa 576(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 592(%rdi), %xmm0 +; SSE-NEXT: movdqa 512(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm2 +; SSE-NEXT: movdqa 800(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm0 +; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa 848(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: movdqa 832(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1200(%rdi), %xmm2 +; SSE-NEXT: movdqa 1120(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1216(%rdi), %xmm0 +; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1248(%rdi), %xmm3 +; SSE-NEXT: movdqa 1168(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm0 +; SSE-NEXT: movdqa 1152(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa 240(%rdi), %xmm3 +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa 496(%rdi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 560(%rdi), %xmm9 +; SSE-NEXT: movdqa 576(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 608(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 512(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm13 -; SSE-NEXT: movdqa 816(%rdi), %xmm0 +; SSE-NEXT: movdqa 880(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 896(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm2 +; SSE-NEXT: movdqa 928(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: movdqa 912(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 1136(%rdi), %xmm0 +; SSE-NEXT: movdqa 1200(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1168(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1152(%rdi), %xmm0 +; SSE-NEXT: movdqa 1248(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2795,52 +2777,62 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 304(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 384(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 464(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2850,45 +2842,45 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 384(%rdi), %xmm1 +; SSE-NEXT: movdqa 544(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 624(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 544(%rdi), %xmm1 +; SSE-NEXT: movdqa 704(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 784(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2898,12 +2890,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 704(%rdi), %xmm1 +; SSE-NEXT: movdqa 864(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2913,47 +2905,22 @@ ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 944(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 1024(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] @@ -2962,12 +2929,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm1 +; SSE-NEXT: movdqa 1104(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -2975,65 +2942,97 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa 1184(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3049,42 +3048,42 @@ ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] @@ -3092,51 +3091,50 @@ ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3147,28 +3145,48 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] @@ -3187,17 +3205,7 @@ ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] @@ -3207,169 +3215,139 @@ ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[2,2,2,2] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3379,16 +3357,15 @@ ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -3407,15 +3384,14 @@ ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[2,2,2,2] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3433,14 +3409,6 @@ ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 176(%rsi) @@ -3449,13 +3417,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 192(%rsi) +; SSE-NEXT: movaps %xmm15, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps %xmm15, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps %xmm15, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -3465,38 +3433,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 224(%rdx) +; SSE-NEXT: movaps %xmm15, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 192(%rdx) +; SSE-NEXT: movaps %xmm15, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 160(%rdx) +; SSE-NEXT: movaps %xmm15, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 96(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 64(%rdx) +; SSE-NEXT: movaps %xmm15, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, 32(%rdx) +; SSE-NEXT: movaps %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps %xmm15, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, 224(%rcx) @@ -3529,7 +3505,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: movaps %xmm15, (%rcx) ; SSE-NEXT: movapd %xmm13, 240(%r8) -; SSE-NEXT: movaps (%rsp), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 224(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 208(%r8) @@ -3575,354 +3551,338 @@ ; SSE-NEXT: movapd %xmm14, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) -; SSE-NEXT: addq $1928, %rsp # imm = 0x788 +; SSE-NEXT: addq $1960, %rsp # imm = 0x7A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2488, %rsp # imm = 0x9B8 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm6[1,3],ymm2[6,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm5[1,3],ymm2[6,5],ymm5[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm8[1,3],ymm2[6,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm9[1,3],ymm2[6,5],ymm9[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm12[2,3],ymm9[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm14[2,3],ymm7[4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm7[1,3],ymm2[6,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm9[1,3],ymm0[6,5],ymm9[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm14[1,3],ymm2[6,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm10[1,3],ymm2[6,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm11[2,3],ymm13[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1],ymm11[1,3],ymm2[6,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[3,0],ymm2[6,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm1[0,0],ymm13[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[2,0],ymm0[7,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,1],ymm0[6,4],ymm5[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,1],ymm0[6,4],ymm9[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3930,177 +3890,180 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm9[2,0],ymm0[7,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,1],ymm0[6,4],ymm9[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm7[2,0],ymm0[7,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[2,1],ymm0[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,1],ymm0[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm0[0,0],ymm4[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,0],ymm4[0,0],ymm10[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm14[2,0],ymm0[7,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm3[2,0],ymm0[7,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,1],ymm1[6,4],ymm3[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm0[0,0],ymm11[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,0],ymm6[0,0],ymm5[5,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm10[2,0],ymm0[7,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm10[2,1],ymm2[6,4],ymm10[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,0],ymm1[2,0],ymm0[7,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm1[2,1],ymm2[6,4],ymm1[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,0],ymm0[0,0],ymm14[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[1,0],ymm8[0,0],ymm7[5,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm13[2,0],ymm0[7,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm13[2,1],ymm3[6,4],ymm13[6,5] -; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm0[2,0],ymm1[7,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm0[2,1],ymm3[6,4],ymm0[6,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2],xmm12[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm12 = xmm12[1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm1[0,0],ymm12[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1],mem[2],xmm15[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} xmm15 = xmm15[1,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm2[0,0],ymm1[5,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,0],ymm1[3,0],ymm12[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm13[2,0],mem[1,0],ymm13[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0],ymm1[3,0],ymm8[4,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm1[3,0],ymm11[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm10[2,0],ymm4[1,0],ymm10[6,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm9[3,0],ymm15[4,4],ymm9[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm9[2,2],ymm15[6,4],ymm9[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, (%rsp), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[2,0],mem[1,0],ymm2[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm1[2,2],ymm15[6,4],ymm1[6,6] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm2[3,0],ymm9[4,4],ymm2[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm6[1,0],ymm5[6,4],ymm6[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm2[2,2],ymm15[6,4],ymm2[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm3[3,0],ymm2[4,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm3[2,2],ymm15[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,0],ymm3[1,0],ymm12[6,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm3[2,0],mem[1,0],ymm3[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm10[3,0],ymm9[4,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6] +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0],ymm3[3,0],ymm13[4,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm3[2,2],ymm15[6,4],ymm3[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm10[1,0],ymm14[6,4],ymm10[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm8[1,0],ymm7[6,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4108,197 +4071,206 @@ ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm6[3,0],ymm3[4,4],ymm6[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm6[2,2],ymm15[6,4],ymm6[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm4[3,0],ymm3[4,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,0],ymm6[1,0],ymm11[6,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[2,0],mem[1,0],ymm14[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm7[3,0],ymm1[4,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0],ymm4[3,0],ymm8[4,4],ymm4[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm4[2,2],ymm15[6,4],ymm4[6,6] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm2[1,0],ymm4[6,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm4[2,0],mem[1,0],ymm4[6,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],mem[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],mem[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, (%rsp), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3,4],mem[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,1,2,3,4],mem[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2,3,4],mem[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $128, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm12, (%r9) ; AVX1-ONLY-NEXT: addq $2488, %rsp # imm = 0x9B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -4306,479 +4278,475 @@ ; AVX2-ONLY-LABEL: load_i32_stride5_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm2 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 768(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 928(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, 1088(%rdi), %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1248(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4],ymm1[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,0,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd $51, (%rsp), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm7[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 1104(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm13[2,3],ymm10[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm15[0,1],mem[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1],mem[2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 784(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpbroadcastd 944(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm8[0,1],mem[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastd 1104(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpbroadcastd 1264(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,7,4,u> +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm7[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3],ymm14[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd $127, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm10[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm8 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm5[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpblendd $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5,6],ymm4[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vinserti128 $1, 736(%rdi), %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm7 -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vinserti128 $1, 1056(%rdi), %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 896(%rdi), %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vinserti128 $1, 1216(%rdi), %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm4[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,6,1,6,1,6,1,6] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3],ymm3[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm9[12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm2[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm10[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm9[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm12[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm5[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm3[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm1[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm1[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm6, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm10[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm9[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm9[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm7, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm10[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm8, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] @@ -4790,88 +4758,98 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%r9) +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm12[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm11, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm11[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm6[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, (%r9) ; AVX2-ONLY-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -4879,201 +4857,201 @@ ; AVX512F-LABEL: load_i32_stride5_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm24 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm12, %zmm19, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm8, %zmm0, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm7, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm8, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm26, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm23, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm23, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm23, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm8, %zmm0, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512F-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm30, %zmm29 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm30, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm31, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm8, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512F-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm31, %zmm20 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm18, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm12 ; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm15, %zmm0, %zmm19 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm15, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm15, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm31, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm7 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm31, %zmm4 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm31, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm27, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm27, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm27, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} +; AVX512F-NEXT: vpermt2d %zmm15, %zmm8, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm8, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm30 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm8, %zmm20 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-NEXT: vpermt2d %zmm5, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm24, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm31, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm30, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%r9) ; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -5081,201 +5059,201 @@ ; AVX512BW-LABEL: load_i32_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm6, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm24 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,5,10,15,20,25,30,u> ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm19, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm6 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm19, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm0, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <17,22,27,0,5,10,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <2,7,12,17,22,27,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm29, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm26, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm23, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm23, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm23, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm0, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm30, %zmm29 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm29, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm29, %zmm17 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,8,13,18,23,28,u,u> -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm30, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <4,9,14,19,24,29,u,u> -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm31, %zmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm31, %zmm20 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm12 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm26, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm0, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm15, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm0 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm15, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm15, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm18 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm31, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm7 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm31, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm27, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm27, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm27, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm18 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm8, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm8, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm30 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm8, %zmm20 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 192(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%r9) ; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -138,31 +138,32 @@ ; AVX512F-FAST-LABEL: load_i32_stride6_vf2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512F-FAST-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 +; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovd %xmm2, %r10d +; AVX512F-FAST-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] -; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX512F-FAST-NEXT: # xmm1 = mem[0,0] -; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm0, %xmm5 +; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm0 = [4,2,4,2] +; AVX512F-FAST-NEXT: # xmm0 = mem[0,0] +; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] ; AVX512F-FAST-NEXT: # xmm6 = mem[0,0] -; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512F-FAST-NEXT: vmovq %xmm2, (%rcx) +; AVX512F-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8) -; AVX512F-FAST-NEXT: vmovlps %xmm1, (%r9) -; AVX512F-FAST-NEXT: vmovlps %xmm3, (%rax) +; AVX512F-FAST-NEXT: vmovlps %xmm0, (%r9) +; AVX512F-FAST-NEXT: vmovlps %xmm2, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -202,31 +203,32 @@ ; AVX512BW-FAST-LABEL: load_i32_stride6_vf2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6] -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-FAST-NEXT: vpextrd $2, %xmm1, %r10d +; AVX512BW-FAST-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7] -; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] -; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 +; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4 +; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX512BW-FAST-NEXT: vmovd %xmm2, %r10d +; AVX512BW-FAST-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] -; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0] -; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm0, %xmm5 +; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm0 = [4,2,4,2] +; AVX512BW-FAST-NEXT: # xmm0 = mem[0,0] +; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] ; AVX512BW-FAST-NEXT: # xmm6 = mem[0,0] -; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 -; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 +; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx) -; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rcx) +; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rcx) ; AVX512BW-FAST-NEXT: vmovq %xmm5, (%r8) -; AVX512BW-FAST-NEXT: vmovlps %xmm1, (%r9) -; AVX512BW-FAST-NEXT: vmovlps %xmm3, (%rax) +; AVX512BW-FAST-NEXT: vmovlps %xmm0, (%r9) +; AVX512BW-FAST-NEXT: vmovlps %xmm2, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %wide.vec = load <12 x i32>, ptr %in.vec, align 64 @@ -959,11 +961,11 @@ ; SSE-NEXT: movdqa 256(%rdi), %xmm5 ; SSE-NEXT: movdqa 192(%rdi), %xmm13 ; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 336(%rdi), %xmm11 +; SSE-NEXT: movdqa 144(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm7 -; SSE-NEXT: movdqa 288(%rdi), %xmm12 -; SSE-NEXT: movdqa 304(%rdi), %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm2 @@ -1002,16 +1004,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 -; SSE-NEXT: movdqa 160(%rdi), %xmm6 +; SSE-NEXT: movdqa 336(%rdi), %xmm15 +; SSE-NEXT: movdqa 352(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,0,1,1] @@ -1065,13 +1067,13 @@ ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm4 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1088,11 +1090,11 @@ ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa 368(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm13 +; SSE-NEXT: movdqa 320(%rdi), %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1196,44 +1198,44 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movapd %xmm14, 16(%r8) +; SSE-NEXT: movapd %xmm14, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movapd %xmm0, 16(%r9) +; SSE-NEXT: movapd %xmm0, 48(%r9) ; SSE-NEXT: movapd %xmm3, 32(%r9) -; SSE-NEXT: movapd %xmm4, 48(%r9) +; SSE-NEXT: movapd %xmm4, 16(%r9) ; SSE-NEXT: movapd %xmm15, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm11, 48(%rax) ; SSE-NEXT: movapd %xmm9, 32(%rax) -; SSE-NEXT: movapd %xmm12, 48(%rax) +; SSE-NEXT: movapd %xmm12, 16(%rax) ; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: addq $360, %rsp # imm = 0x168 ; SSE-NEXT: retq @@ -2153,464 +2155,465 @@ ; SSE-LABEL: load_i32_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 528(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm4 -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm15 ; SSE-NEXT: movdqa 144(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm8 +; SSE-NEXT: movdqa 448(%rdi), %xmm3 +; SSE-NEXT: movdqa 384(%rdi), %xmm4 +; SSE-NEXT: movdqa 400(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,0,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,0,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm2 +; SSE-NEXT: movdqa 528(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 240(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa 624(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm1 -; SSE-NEXT: movdqa 592(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movdqa 336(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] +; SSE-NEXT: movdqa 688(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: movdqa 736(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa 272(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa 224(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 368(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa 320(%rdi), %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 464(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,3,2,3] ; SSE-NEXT: movdqa 512(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 656(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa 608(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2620,18 +2623,20 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2639,9 +2644,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2649,20 +2655,19 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2670,86 +2675,84 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) @@ -2760,38 +2763,39 @@ ; SSE-NEXT: movapd %xmm4, 80(%r9) ; SSE-NEXT: movapd %xmm5, 64(%r9) ; SSE-NEXT: movapd %xmm6, 48(%r9) -; SSE-NEXT: movapd %xmm13, 32(%r9) -; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movapd %xmm12, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm14, 112(%rax) -; SSE-NEXT: movapd %xmm10, 96(%rax) -; SSE-NEXT: movapd %xmm7, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movapd %xmm13, 96(%rax) +; SSE-NEXT: movapd %xmm11, 80(%rax) +; SSE-NEXT: movapd %xmm15, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) -; SSE-NEXT: movapd %xmm12, (%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movapd %xmm10, (%rax) ; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1064, %rsp # imm = 0x428 +; AVX1-ONLY-NEXT: subq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 @@ -2808,14 +2812,14 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm2[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[3],ymm3[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm5[0,0],ymm4[6,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,2],ymm0[6,4],ymm5[6,6] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2881,6 +2885,7 @@ ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[0,1] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] @@ -2897,8 +2902,8 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[3,0],ymm5[1,0],ymm14[7,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[3,0],ymm5[1,0],ymm12[7,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,0],xmm4[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,2],xmm4[1,3] @@ -2909,137 +2914,156 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,0],ymm3[1,0],ymm0[7,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,0],xmm2[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm10[1,3],ymm0[7,5],ymm10[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,0],ymm9[1,0],ymm5[7,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,0],ymm9[1,0],ymm14[7,4],ymm9[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm9[2,3],ymm2[6,4],ymm9[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm1[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm13[1,3],ymm12[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm4[1,3],ymm1[7,5],ymm4[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm15[2,1],mem[2,0],ymm15[6,5],mem[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm12[2,1],mem[2,0],ymm12[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,0],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm8[0,1,2,3],mem[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,1],ymm11[2,0],ymm5[6,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[0,0],ymm7[2,0],ymm0[4,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2],ymm8[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm6[2,1],mem[2,0],ymm6[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,0],ymm11[2,0],ymm12[4,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm13[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,1],ymm6[2,0],ymm1[6,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,1],ymm10[2,0],ymm14[6,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,0],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[3,1],ymm5[4,5],ymm4[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm13[2,1],ymm5[7,5],ymm13[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm10[3,1],ymm7[4,5],ymm10[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5] ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm6[2,1],ymm13[7,5],ymm6[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[3,1],xmm14[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm6[2,1],ymm1[7,5],ymm6[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1],ymm7[3,1],ymm2[4,5],ymm7[7,5] ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm11[2,1],ymm2[7,5],ymm11[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm7[2,1],ymm4[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1],ymm11[3,1],ymm12[4,5],ymm11[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm5[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1],ymm11[2,1],ymm3[7,5],ymm11[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,1],xmm14[3,3] +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,1],ymm10[2,1],ymm8[7,5],ymm10[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,0],ymm13[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm6[0,0],ymm10[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3052,159 +3076,139 @@ ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm1[2,0],ymm5[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm14[0,0],ymm2[6,4],ymm14[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm1[2,0],ymm14[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm13[0],ymm1[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm0[0,0],ymm4[6,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,0],ymm0[4,6],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[1],ymm11[0],ymm5[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,0],ymm4[4,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[2,0],ymm0[0,0],ymm12[6,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm15[0],ymm2[2],ymm15[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,0],ymm12[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm3[2,0],ymm1[4,6],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0],xmm3[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm9[1],ymm3[0],ymm9[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,0],ymm7[4,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,0],ymm15[0,0],ymm13[6,4],ymm15[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm0[1,0],ymm4[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm14[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,1],ymm4[2,0],ymm10[5,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,3],ymm0[2,0],ymm5[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0],xmm3[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm3[1],ymm15[0],ymm3[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,0],ymm8[4,5],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm7[2,0],ymm11[0,0],ymm7[6,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,2],ymm15[2,0],ymm11[4,6],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm6[1,0],ymm10[7,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,3],ymm1[2,0],ymm6[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],ymm5[2,0],ymm10[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,0],ymm1[1,0],ymm6[7,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[2,0],ymm1[4,7],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,1],mem[1,3],ymm2[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm2[2,0],ymm6[5,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm15[1,1],ymm10[2,0],ymm15[5,5],ymm10[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[2,0],ymm0[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm2[2,0],ymm4[5,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm15[1,0],ymm13[7,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,3],ymm1[2,0],ymm15[4,7],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,1],ymm3[1,3],ymm9[7,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm3[2,0],ymm7[5,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm14[1,0],ymm1[7,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,3],ymm1[2,0],ymm14[4,7],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm11[1,0],ymm7[7,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,3],ymm2[2,0],ymm11[4,7],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,1],mem[1,3],ymm3[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,1],ymm3[2,0],ymm8[5,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: addq $1048, %rsp # imm = 0x418 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3213,7 +3217,7 @@ ; AVX2-SLOW-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 @@ -3229,393 +3233,393 @@ ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm14, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[0,1],ymm1[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,3,5,3,5,3,5,3] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm12, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [4,2,4,2] ; AVX2-SLOW-NEXT: # xmm14 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm9 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm9 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3625,7 +3629,7 @@ ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 @@ -3641,225 +3645,228 @@ ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $204, (%rsp), %ymm15, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -3867,162 +3874,158 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] -; AVX2-FAST-NEXT: # xmm14 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm8 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm8[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm9 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm9 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -4032,7 +4035,7 @@ ; AVX2-FAST-PERLANE-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 @@ -4048,393 +4051,393 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm14, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[0,1],ymm1[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,3,5,3,5,3,5,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,3,5,3,5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm6[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm10[0,1],mem[2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm9 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -4748,35 +4751,35 @@ ; SSE-LABEL: load_i32_stride6_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2184, %rsp # imm = 0x888 -; SSE-NEXT: movdqa 912(%rdi), %xmm7 +; SSE-NEXT: movdqa 816(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 928(%rdi), %xmm3 +; SSE-NEXT: movdqa 832(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 864(%rdi), %xmm8 +; SSE-NEXT: movdqa 768(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 880(%rdi), %xmm4 +; SSE-NEXT: movdqa 784(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 528(%rdi), %xmm9 +; SSE-NEXT: movdqa 432(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm5 +; SSE-NEXT: movdqa 448(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm6 +; SSE-NEXT: movdqa 384(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4796,217 +4799,221 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm1 +; SSE-NEXT: movdqa 1152(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1264(%rdi), %xmm0 +; SSE-NEXT: movdqa 1168(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1296(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1312(%rdi), %xmm0 +; SSE-NEXT: movdqa 1200(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1216(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 384(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: movdqa 480(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 496(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 432(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm0 +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm0 +; SSE-NEXT: movdqa 864(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 +; SSE-NEXT: movdqa 912(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: movdqa 928(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1152(%rdi), %xmm1 +; SSE-NEXT: movdqa 1248(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm0 +; SSE-NEXT: movdqa 1264(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1216(%rdi), %xmm0 +; SSE-NEXT: movdqa 1296(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1312(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa 304(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 208(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 336(%rdi), %xmm10 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm9 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 720(%rdi), %xmm3 +; SSE-NEXT: movdqa 624(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa 640(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 960(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 +; SSE-NEXT: movdqa 976(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1104(%rdi), %xmm3 +; SSE-NEXT: movdqa 1008(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm0 +; SSE-NEXT: movdqa 1024(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1440(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1344(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1456(%rdi), %xmm0 +; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm2 +; SSE-NEXT: movdqa 1392(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1504(%rdi), %xmm0 +; SSE-NEXT: movdqa 1408(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 240(%rdi), %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm6 -; SSE-NEXT: movdqa 592(%rdi), %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa 336(%rdi), %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm11 -; SSE-NEXT: movdqa 640(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm1 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm8[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 960(%rdi), %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: movdqa 720(%rdi), %xmm3 +; SSE-NEXT: movdqa 736(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm12[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1056(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 976(%rdi), %xmm0 +; SSE-NEXT: movdqa 1072(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movdqa 1008(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: movdqa 1104(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1440(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1456(%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1408(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm15[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, (%rsp), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1504(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm15[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5014,29 +5021,27 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm10[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm14[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm14[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshufd $85, (%rsp), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] @@ -5055,11 +5060,11 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm14[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm14[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5067,27 +5072,16 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5098,19 +5092,20 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5122,10 +5117,9 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5133,35 +5127,35 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -5169,295 +5163,307 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa 32(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 176(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa 176(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 128(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 272(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] +; SSE-NEXT: movdqa 272(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 224(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 368(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,3,2,3] -; SSE-NEXT: movdqa 320(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa 464(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 416(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa 320(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm8 +; SSE-NEXT: movdqa 464(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa 512(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: pshufd $238, (%rsp), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 416(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 656(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 608(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 752(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa 656(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 704(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa 608(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa 848(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 752(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa 800(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 704(%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa 944(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa 848(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 800(%rdi), %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa 944(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa 896(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa 1040(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] ; SSE-NEXT: movdqa 992(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa 1136(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa 1136(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1088(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa 1232(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa 1232(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] ; SSE-NEXT: movdqa 1184(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa 1328(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa 1328(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[2,3,2,3] -; SSE-NEXT: movdqa 1280(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movdqa 1280(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] -; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1376(%rdi), %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa 1520(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa 1520(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa 1472(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1472(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, (%rsp), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5472,11 +5478,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -5488,13 +5495,12 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -5504,108 +5510,106 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] @@ -5615,7 +5619,7 @@ ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5627,19 +5631,19 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5653,7 +5657,8 @@ ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] @@ -5662,35 +5667,34 @@ ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] @@ -5702,10 +5706,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5713,10 +5717,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5724,10 +5728,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5735,21 +5739,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5757,9 +5761,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5767,10 +5772,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5778,10 +5783,10 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -5789,17 +5794,17 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5832,14 +5837,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm13 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5848,7 +5852,7 @@ ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] @@ -5857,21 +5861,14 @@ ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -5880,13 +5877,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5896,38 +5893,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -5981,13 +5986,13 @@ ; SSE-NEXT: movaps %xmm0, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) @@ -5997,8 +6002,8 @@ ; SSE-NEXT: movapd %xmm6, 192(%r9) ; SSE-NEXT: movapd %xmm7, 176(%r9) ; SSE-NEXT: movapd %xmm8, 160(%r9) -; SSE-NEXT: movapd %xmm9, 144(%r9) -; SSE-NEXT: movapd %xmm12, 128(%r9) +; SSE-NEXT: movapd %xmm10, 144(%r9) +; SSE-NEXT: movapd %xmm11, 128(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -6050,26 +6055,26 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2536, %rsp # imm = 0x9E8 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: subq $2568, %rsp # imm = 0xA08 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm9[2,2],ymm0[6,4],ymm9[6,6] @@ -6086,11 +6091,11 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,2],ymm0[6,4],ymm1[6,6] -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6098,9 +6103,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm14[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6108,16 +6113,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm13[0,0],ymm1[6,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm13[2,2],ymm0[6,4],ymm13[6,6] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6125,9 +6130,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6135,16 +6140,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm10[0,0],ymm1[6,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm10[2,2],ymm0[6,4],ymm10[6,6] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6152,9 +6157,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6162,16 +6167,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm8[0,0],ymm1[6,4],ymm8[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,2],ymm0[6,4],ymm8[6,6] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6179,9 +6184,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6189,16 +6194,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,2],ymm0[6,4],ymm6[6,6] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6206,9 +6211,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6216,44 +6221,44 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm4[0,0],ymm1[6,4],ymm4[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,2],ymm0[6,4],ymm4[6,6] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm2[2,2],ymm0[6,4],ymm2[6,6] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6261,9 +6266,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,2],xmm12[0,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6294,9 +6299,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,0],xmm14[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[0,2],xmm14[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6307,9 +6312,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[1,0],xmm11[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,2],xmm11[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6326,16 +6331,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[1,0],ymm0[7,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm8[1,0],ymm13[7,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,3],ymm0[6,4],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,0],xmm7[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,2],xmm7[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,1],ymm9[1,3],ymm1[7,5],ymm9[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6346,28 +6351,28 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,0],xmm5[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,2],xmm5[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1],ymm7[1,3],ymm1[7,5],ymm7[5,7] +; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[1,0],ymm0[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,3],ymm0[6,4],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,0],xmm3[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm4[1,3],ymm1[7,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,1],ymm7[1,3],ymm1[7,5],ymm7[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm1[1,0],ymm3[7,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[1,0],ymm0[7,4],ymm1[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,3],ymm0[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm12[3,0] @@ -6396,7 +6401,7 @@ ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6412,12 +6417,13 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6431,11 +6437,12 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6449,174 +6456,176 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm6[2,0],ymm1[4,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[2,1],ymm8[2,0],ymm13[6,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm4[2,0],ymm0[4,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,0],ymm2[2,0],ymm13[4,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,1],ymm9[2,0],ymm0[6,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm5[2,0],ymm3[6,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm14[2,0],ymm1[4,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,1],mem[3,1],ymm10[4,5],mem[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm11[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,1],ymm8[3,1],ymm10[4,5],ymm8[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[3,1],ymm8[4,5],ymm6[7,5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[3,1],mem[2,1],ymm10[7,5],mem[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm14[3,1],ymm1[4,5],ymm14[7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[3,3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm2[3,1],ymm13[4,5],ymm2[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm7[2,1],ymm3[7,5],ymm7[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,1],ymm10[2,0],ymm0[6,5],ymm10[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5] -; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm15[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,1],ymm7[2,0],ymm0[6,5],ymm7[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,0],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2],ymm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm3[2,0],ymm1[4,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm13[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[2,1],ymm13[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm4[3,1],ymm0[4,5],ymm4[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm5[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[3,1],ymm8[2,1],ymm13[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm5[2,1],ymm4[7,5],ymm5[6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm15[2,1],ymm5[7,5],ymm15[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm6[3,1],ymm0[4,5],ymm6[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm9[2,1],ymm4[7,5],ymm9[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm4[3,1],mem[2,1],ymm4[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm2[3,1],ymm0[4,5],ymm2[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm12[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm10[2,1],ymm2[7,5],ymm10[6,5] +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $116, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[3,1],mem[3,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm8[2,1],ymm6[7,5],ymm8[6,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,1],mem[2,1],ymm2[7,5],mem[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm3[3,1],ymm0[4,5],ymm3[7,5] +; AVX1-ONLY-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[3,1],mem[3,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm7[2,1],ymm2[7,5],ymm7[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] @@ -6624,6 +6633,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -6632,35 +6642,34 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm8[0,0],ymm2[6,4],ymm8[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm1[2,0],ymm8[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm3[1],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,0],ymm13[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,0],ymm1[6,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,0],ymm6[4,6],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6675,9 +6684,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm5[0],ymm1[2],ymm5[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm15[0],ymm1[2],ymm15[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm5[0,0],ymm2[6,4],ymm5[4,4] @@ -6691,7 +6701,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6716,8 +6726,10 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 848(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4] @@ -6735,8 +6747,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 1040(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm9[0],ymm1[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] @@ -6756,7 +6767,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 1232(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[2] +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[2] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] @@ -6774,10 +6786,10 @@ ; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[1],mem[0],ymm14[2],mem[2] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm13[1],mem[0],ymm13[2],mem[2] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] @@ -6793,8 +6805,7 @@ ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm13[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm14[1,3],ymm15[7,5],ymm14[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] @@ -6820,7 +6831,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -6830,7 +6841,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm8[2,0],ymm4[4,7],ymm8[6,4] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -6894,14 +6905,6 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) @@ -6910,13 +6913,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -6926,13 +6929,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -6942,22 +6945,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) @@ -6982,204 +6993,204 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm14, (%rax) -; AVX1-ONLY-NEXT: addq $2536, %rsp # imm = 0x9E8 +; AVX1-ONLY-NEXT: addq $2568, %rsp # imm = 0xA08 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $2440, %rsp # imm = 0x988 +; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[0,1],ymm5[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovaps 1248(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1312(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovaps 1504(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm12 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm12, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] @@ -7187,53 +7198,74 @@ ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm12, %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm12, %ymm3 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -7244,15 +7276,15 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -7273,10 +7305,11 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -7287,132 +7320,126 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, (%rsp), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = mem[3,3,3,3] @@ -7434,261 +7461,250 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -7697,31 +7713,30 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm1, %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7729,8 +7744,8 @@ ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7747,25 +7762,18 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rsi) @@ -7774,13 +7782,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7790,13 +7798,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7806,22 +7814,30 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) @@ -7845,209 +7861,208 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-SLOW-NEXT: addq $2440, %rsp # imm = 0x988 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $2472, %rsp # imm = 0x9A8 -; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovaps 1248(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm14[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps 1504(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm12 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] @@ -8055,357 +8070,446 @@ ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm12, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm12[2,3],ymm14[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1,2,3],ymm1[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $224, (%rsp), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm5[0,1],mem[2,3],ymm5[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm8[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm5[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, (%rsp), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpermilps $244, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8413,145 +8517,62 @@ ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] -; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, (%rsp), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm13[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -8560,76 +8581,67 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpermps (%rsp), %ymm1, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rsi) @@ -8638,13 +8650,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8654,13 +8666,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8670,22 +8682,30 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) @@ -8702,15 +8722,14 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) @@ -8720,198 +8739,198 @@ ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2504, %rsp # imm = 0x9C8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: subq $2440, %rsp # imm = 0x988 +; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[0,1],ymm5[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [4,2,4,2,4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm12 = [4,2,4,2,4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm2[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[0,1],ymm7[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1024(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1376(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1024(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1248(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1312(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1504(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm12 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm12, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] @@ -8919,53 +8938,74 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm12, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8976,15 +9016,36 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9005,146 +9066,120 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps $51, (%rsp), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm5[2,3],mem[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2,3],ymm2[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[3,3,3,3] @@ -9166,261 +9201,250 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm3[0,1],mem[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm5, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -9429,31 +9453,30 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $8, (%rsp), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0,1,2],mem[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9461,8 +9484,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -9479,25 +9502,18 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rsi) @@ -9506,13 +9522,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9522,13 +9538,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9538,22 +9554,30 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) @@ -9577,714 +9601,707 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX2-FAST-PERLANE-NEXT: addq $2440, %rsp # imm = 0x988 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride6_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512F-NEXT: subq $2568, %rsp # imm = 0xA08 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm1, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm13, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm1, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm11, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm18, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm31, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm6, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm21, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vpermt2d %zmm12, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <2,8,14,20,26,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm22 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm12 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm4, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm9, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm9, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512F-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: movw $31, %ax ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm28 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-NEXT: addq $2568, %rsp # imm = 0xA08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm13 +; AVX512BW-NEXT: subq $2568, %rsp # imm = 0xA08 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm8, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm13, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm9, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm18, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm31, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm18, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm31, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm6, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm17, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm27 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm21, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm17, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,7,13,19,25,31,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <2,8,14,20,26,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <3,9,15,21,27,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <20,26,0,6,12,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm22 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm22 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm10, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm12 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm4, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm9, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm10 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm7 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <21,27,1,7,13,u,u,u> -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} ; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: movw $31, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm28 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm31 {%k1} ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512BW-NEXT: addq $2568, %rsp # imm = 0xA08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <384 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -171,12 +171,10 @@ ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [13,4,13,4,13,4,13,4] ; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [6,13,6,13,6,13,6,13] ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -240,12 +238,10 @@ ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7] -; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [13,4,13,4,13,4,13,4] ; AVX512BW-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7] -; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [6,13,6,13,6,13,6,13] ; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -341,53 +337,54 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm7[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm6[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r10) +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r10) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -409,19 +406,19 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -429,21 +426,21 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-SLOW-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-SLOW-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-SLOW-NEXT: vmovaps %xmm4, (%rcx) -; AVX2-SLOW-NEXT: vmovaps %xmm7, (%r8) +; AVX2-SLOW-NEXT: vmovaps %xmm6, (%r8) ; AVX2-SLOW-NEXT: vmovaps %xmm9, (%r9) ; AVX2-SLOW-NEXT: vmovaps %xmm5, (%r10) ; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rax) @@ -461,26 +458,26 @@ ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -488,21 +485,21 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-FAST-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FAST-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-FAST-NEXT: vmovaps %xmm5, (%rcx) -; AVX2-FAST-NEXT: vmovaps %xmm7, (%r8) +; AVX2-FAST-NEXT: vmovaps %xmm6, (%r8) ; AVX2-FAST-NEXT: vmovaps %xmm9, (%r9) ; AVX2-FAST-NEXT: vmovaps %xmm4, (%r10) ; AVX2-FAST-NEXT: vmovaps %xmm0, (%rax) @@ -526,19 +523,19 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0],xmm8[1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] @@ -546,21 +543,21 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, (%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, (%r10) ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, (%rax) @@ -618,107 +615,107 @@ ; SSE-LABEL: load_i32_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax -; SSE-NEXT: movdqa 144(%rdi), %xmm11 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 112(%rdi), %xmm8 -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 112(%rdi), %xmm13 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm4[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm2[0],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm14[0],xmm11[1] +; SSE-NEXT: movdqa %xmm2, %xmm15 +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm15[0],xmm13[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm13[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] @@ -727,29 +724,29 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movapd %xmm10, (%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm15, (%rcx) -; SSE-NEXT: movapd %xmm12, 16(%rcx) -; SSE-NEXT: movapd %xmm4, (%r8) -; SSE-NEXT: movapd %xmm6, 16(%r8) -; SSE-NEXT: movapd %xmm14, (%r9) -; SSE-NEXT: movapd %xmm11, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movapd %xmm10, 16(%rdx) +; SSE-NEXT: movapd %xmm9, (%rdx) +; SSE-NEXT: movapd %xmm14, 16(%rcx) +; SSE-NEXT: movapd %xmm11, (%rcx) +; SSE-NEXT: movapd %xmm5, 16(%r8) +; SSE-NEXT: movapd %xmm8, (%r8) +; SSE-NEXT: movapd %xmm15, 16(%r9) +; SSE-NEXT: movapd %xmm13, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm12, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, (%rax) -; SSE-NEXT: movapd %xmm3, 16(%rax) +; SSE-NEXT: movapd %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm3, (%rax) ; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; @@ -757,95 +754,96 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm2[0],ymm11[2],ymm2[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[2,2],ymm12[5,5],ymm11[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm9[0],xmm13[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2],ymm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm4[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm9[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm11[0,3],ymm2[7,5],ymm11[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,0],ymm12[0,0],ymm11[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm11[0,2],ymm12[7,5],ymm11[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,2],ymm12[2,0],ymm8[4,6],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[1,0],ymm4[2,0],ymm14[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[1,1],ymm10[2,2],ymm11[5,5],ymm10[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,0],ymm9[3,3],ymm8[4,4],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm11[2,1],ymm15[2,0],ymm11[6,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm11[0,0],ymm10[5,4],ymm11[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1],ymm10[0,2],ymm11[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,0],ymm4[1,1],ymm12[4,4],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,2],ymm11[2,0],ymm8[4,6],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,0],ymm4[2,0],ymm12[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm12[2,0],ymm8[6,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm11[2,0],ymm8[6,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[2,1],ymm4[3,3],ymm14[6,5],ymm4[7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[0,1],xmm9[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,0],ymm9[0,0],ymm14[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[2,0],ymm4[3,1],ymm12[6,4],ymm4[7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm5[0],xmm6[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,1],xmm13[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm13[0,0],ymm12[7,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,0],ymm5[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,0],ymm5[4,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) @@ -1245,24 +1243,23 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 -; SSE-NEXT: movdqa 304(%rdi), %xmm9 -; SSE-NEXT: movdqa 272(%rdi), %xmm2 -; SSE-NEXT: movdqa 224(%rdi), %xmm10 +; SSE-NEXT: subq $408, %rsp # imm = 0x198 +; SSE-NEXT: movdqa 416(%rdi), %xmm9 +; SSE-NEXT: movdqa 384(%rdi), %xmm2 +; SSE-NEXT: movdqa 336(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm15 +; SSE-NEXT: movdqa 352(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1273,11 +1270,13 @@ ; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1290,38 +1289,39 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm7 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm10 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm15 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1332,32 +1332,35 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm14 +; SSE-NEXT: movdqa 368(%rdi), %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm5, %xmm13 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm5 +; SSE-NEXT: movdqa 176(%rdi), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] @@ -1366,7 +1369,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: movdqa 400(%rdi), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] @@ -1374,16 +1377,15 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 288(%rdi), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: movdqa 96(%rdi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,0,1,1] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] @@ -1391,18 +1393,19 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa 208(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,3,3] ; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm0 +; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm0 @@ -1413,12 +1416,13 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm14[2],xmm9[3],xmm14[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm13 +; SSE-NEXT: movdqa 320(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1521,270 +1525,271 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm15[0],xmm14[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: movaps %xmm13, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: movaps %xmm13, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movaps %xmm13, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm7, 48(%r9) -; SSE-NEXT: movapd %xmm5, 32(%r9) -; SSE-NEXT: movapd %xmm9, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm6, 48(%rax) -; SSE-NEXT: movapd %xmm1, 32(%rax) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm7, 32(%r9) +; SSE-NEXT: movapd %xmm5, 48(%r9) +; SSE-NEXT: movapd %xmm9, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movapd %xmm6, 32(%rax) +; SSE-NEXT: movapd %xmm1, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm14, 48(%rax) -; SSE-NEXT: movapd %xmm12, 32(%rax) -; SSE-NEXT: movapd %xmm11, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) -; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movapd %xmm14, 32(%rax) +; SSE-NEXT: movapd %xmm12, 48(%rax) +; SSE-NEXT: movapd %xmm11, 16(%rax) +; SSE-NEXT: movapd %xmm8, (%rax) +; SSE-NEXT: addq $408, %rsp # imm = 0x198 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm5[2,2],ymm7[5,5],ymm5[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm8[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm2[3,3],ymm1[4,4],ymm2[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm4[2,2],ymm13[5,5],ymm4[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0],xmm9[1],xmm11[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovups %ymm12, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm7[2,2],ymm4[5,5],ymm7[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm3[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm15[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0],ymm2[3,3],ymm8[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1],ymm3[2,2],ymm10[5,5],ymm3[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm3[0,1] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm2[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm9[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[2,1],ymm12[2,0],ymm13[6,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm11[3,1],ymm7[0,3],ymm11[7,5],ymm7[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[2,1],ymm13[2,0],ymm4[6,5],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],ymm15[0],ymm8[2],ymm15[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,2],ymm5[2,0],ymm1[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm10[2,1],ymm11[2,0],ymm10[6,5],ymm11[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm4[0,0],ymm7[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1],ymm1[0,2],ymm4[7,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,2],ymm4[2,0],ymm0[4,6],ymm4[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm2[0,0],ymm1[7,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm8[2,0],ymm7[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,0],ymm15[1,1],ymm11[4,4],ymm15[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,2],ymm7[2,0],ymm8[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm10[0,0],ymm3[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,1],ymm3[0,2],ymm10[7,5],ymm3[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[0,0],ymm2[1,1],ymm4[4,4],ymm2[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm3[0,0],ymm8[7,4],ymm3[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,0],ymm15[2,0],ymm11[5,4],ymm15[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[2,0],ymm7[2,0],ymm3[6,4],ymm7[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm5[0,0],ymm0[7,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm2[2,0],ymm4[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm7[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,1],ymm8[3,3],ymm7[6,5],ymm8[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[3,1],ymm4[6,4],ymm2[7,5] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[2,0],ymm5[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,1],ymm3[3,3],ymm5[6,5],ymm3[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,0],ymm3[1,0],ymm13[4,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm15[3,1],ymm11[6,4],ymm15[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0],xmm15[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm2[1,0],ymm10[4,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm6[0,0],ymm4[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm3[2,0],ymm13[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm6[0,0],ymm11[7,4],ymm6[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm2[2,0],ymm10[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1797,16 +1802,16 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1814,45 +1819,44 @@ ; AVX2-SLOW-LABEL: load_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] @@ -1860,68 +1864,68 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] @@ -1936,28 +1940,28 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -1971,7 +1975,7 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm13[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] @@ -1992,7 +1996,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -2025,8 +2029,8 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) @@ -2040,6 +2044,7 @@ ; AVX2-FAST-LABEL: load_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 @@ -2047,7 +2052,6 @@ ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -2065,11 +2069,11 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] @@ -2267,45 +2271,44 @@ ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] @@ -2313,68 +2316,68 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm15[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm11[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] @@ -2389,28 +2392,28 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,7,0,7,0,7,0,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -2424,7 +2427,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] @@ -2445,7 +2448,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -2478,8 +2481,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) @@ -2744,454 +2747,447 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i32_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movdqa 80(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa 640(%rdi), %xmm2 -; SSE-NEXT: movdqa 608(%rdi), %xmm3 -; SSE-NEXT: movdqa 560(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $1160, %rsp # imm = 0x488 ; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm8 ; SSE-NEXT: movdqa 112(%rdi), %xmm12 -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 528(%rdi), %xmm2 +; SSE-NEXT: movdqa 496(%rdi), %xmm3 +; SSE-NEXT: movdqa 448(%rdi), %xmm11 +; SSE-NEXT: movdqa 464(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm9 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm2 -; SSE-NEXT: movdqa 496(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm2 +; SSE-NEXT: movdqa 608(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm2 -; SSE-NEXT: movdqa 384(%rdi), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm1 +; SSE-NEXT: movdqa 672(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm2 -; SSE-NEXT: movdqa 832(%rdi), %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa 752(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 720(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 352(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm3 +; SSE-NEXT: movdqa 384(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: movdqa 784(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 800(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm2 +; SSE-NEXT: movdqa 832(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm14 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 704(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa 704(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm3 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm8[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm0 +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 656(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 544(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 656(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 768(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 768(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -3201,58 +3197,50 @@ ; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -3261,23 +3249,31 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3287,16 +3283,16 @@ ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3316,7 +3312,7 @@ ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3326,7 +3322,7 @@ ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -3336,74 +3332,74 @@ ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) @@ -3436,11 +3432,11 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movapd %xmm15, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3460,536 +3456,526 @@ ; SSE-NEXT: movapd %xmm7, 32(%rax) ; SSE-NEXT: movapd %xmm8, 16(%rax) ; SSE-NEXT: movapd %xmm13, (%rax) -; SSE-NEXT: addq $1176, %rsp # imm = 0x498 +; SSE-NEXT: addq $1160, %rsp # imm = 0x488 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm15[0],ymm5[0],ymm15[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm14[2,2],ymm10[5,5],ymm14[6,6] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,0],ymm5[3,3],ymm6[4,4],ymm5[7,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm9[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,0],ymm1[3,3],ymm6[4,4],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm0[2,2],ymm8[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm11[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm2[3,3],ymm13[4,4],ymm2[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm15[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm9[0,1] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,0],ymm5[3,3],ymm2[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm14[2] +; AVX1-ONLY-NEXT: vmovaps %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm12[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm6[2,2],ymm4[5,5],ymm6[6,6] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3],ymm3[0,1] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm15[3,3],ymm5[4,4],ymm15[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm15[1,2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[0,3],ymm15[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm15[2,0],ymm5[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm9[0,3],ymm15[7,5],ymm9[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,1],ymm15[2,0],ymm8[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[2],ymm11[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,0],ymm5[3,3],ymm13[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm4[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0],ymm5[3,3],ymm3[4,4],ymm5[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm14[0,3],ymm5[7,5],ymm14[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,1],ymm15[2,0],ymm4[6,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm3[0],ymm12[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm3[0,3],ymm14[7,5],ymm3[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[2,1],ymm5[2,0],ymm15[6,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm4[0],ymm11[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,1],ymm5[2,0],ymm7[6,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm1[0],ymm13[2],ymm1[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0],ymm1[0,0],ymm6[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1],ymm6[0,2],ymm1[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,1],ymm6[0,3],ymm9[7,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm12[2,1],ymm9[2,0],ymm12[6,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm10[2,0],ymm5[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[1,3],ymm1[4,5],ymm0[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2],ymm10[2,0],ymm13[4,6],ymm10[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm15[0,0],ymm0[5,4],ymm15[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,1],ymm0[0,2],ymm15[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm4[1,3],ymm2[4,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,2],ymm9[2,0],ymm11[4,6],ymm9[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm9[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0],ymm13[1,1],ymm9[4,4],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2],ymm8[2,0],ymm2[4,6],ymm8[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm7[0,0],ymm0[5,4],ymm7[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[3,1],ymm0[0,2],ymm7[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,0],ymm8[1,1],ymm3[4,4],ymm8[5,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm3[1,0],ymm4[0,0],ymm3[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm5[0,2],ymm4[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm5[0,0],ymm0[5,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm6[1,3],ymm8[4,5],ymm6[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0],ymm1[1,1],ymm15[4,4],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm5[2,0],ymm11[4,6],ymm5[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,0],ymm4[0,0],ymm6[5,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm14[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,0],ymm10[1,1],ymm4[4,4],ymm10[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm1[2,0],ymm12[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm13[2,0],ymm9[5,4],ymm13[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0],ymm1[0,0],ymm3[7,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2],ymm5[2,0],ymm9[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,0],ymm4[0,0],ymm13[7,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm2[0,0],ymm11[7,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,0],ymm14[2,0],ymm8[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm1[2,0],ymm15[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,0],ymm2[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,0],ymm2[0,0],ymm12[7,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,0],ymm2[6,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm3[0,0],ymm11[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,0],ymm5[2,0],ymm15[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm2[2,0],ymm14[5,4],ymm2[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[0,1],xmm0[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0],xmm8[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[2,0],ymm0[3,1],ymm9[6,4],ymm0[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,0],ymm15[1,0],ymm12[4,4],ymm15[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[1,0],ymm3[2,0],ymm14[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm14[0,1],xmm9[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, (%rsp), %xmm11, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[2,0],ymm1[3,1],ymm15[6,4],ymm1[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,0],ymm6[2,0],ymm14[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm3[1,0],ymm2[4,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,1],xmm13[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,1],ymm5[3,3],ymm6[6,5],ymm5[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0],xmm2[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,0],ymm0[2,0],ymm14[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm14[0,1],xmm8[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,0],ymm5[3,1],ymm12[6,4],ymm5[7,5] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0],xmm5[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,0],ymm14[1,0],ymm9[4,4],ymm14[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1],xmm4[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm8[2,0],mem[3,1],ymm8[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[0,1],xmm6[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm4[0,0],ymm15[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,0],ymm4[0,0],ymm0[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,0],ymm6[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm6[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm5[0,0],ymm6[1,0],ymm5[4,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[0,1],xmm1[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,0],ymm0[0,0],ymm4[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,0],ymm15[2,0],ymm12[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm1[0,0],ymm9[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm6[2,0],ymm5[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm1[0,0],ymm12[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm7[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,0],ymm14[2,0],ymm9[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[0,0],ymm8[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm5[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3999,10 +3985,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) @@ -4019,46 +4005,45 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-SLOW-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -4070,10 +4055,10 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] @@ -4087,14 +4072,13 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4113,9 +4097,9 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -4135,13 +4119,13 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4157,12 +4141,14 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -4178,11 +4164,10 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] @@ -4195,385 +4180,385 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,2],ymm11[1,3],ymm6[4,6],ymm11[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,2],ymm10[1,3],ymm9[4,6],ymm10[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2],ymm8[1,3],ymm9[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm5[1,3],ymm7[4,6],ymm5[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm8 +; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 548(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastd 660(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastd 772(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm14, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 884(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 216(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 664(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vpbroadcastd 888(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm4, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm4, %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 584(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpermd 640(%rdi), %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 528(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 808(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-SLOW-NEXT: vpermd 864(%rdi), %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 752(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rax) -; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1192, %rsp # imm = 0x4A8 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm8[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -4585,15 +4570,12 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4604,14 +4586,13 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4622,11 +4603,13 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4651,15 +4634,13 @@ ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4667,20 +4648,21 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4694,174 +4676,177 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm4[2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm7[1],ymm12[2,3,4],ymm7[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[12,13,14,15],ymm13[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm13[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm13[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm7[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm3[1],xmm14[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2],ymm4[1,3],ymm12[4,6],ymm4[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm13[1,3],ymm6[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $2, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[0,2],ymm5[1,3],ymm10[4,6],ymm5[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm15 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm8[1,3],ymm5[4,6],ymm8[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm10[1],mem[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm9[1,3],ymm12[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm4[1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm15[1,3],ymm2[4,6],ymm15[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -4871,135 +4856,139 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm13 ; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm12 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm12 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm11 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload @@ -5008,25 +4997,25 @@ ; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r8) @@ -5035,45 +5024,43 @@ ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rax) ; AVX2-FAST-NEXT: addq $1192, %rsp # imm = 0x4A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-PERLANE-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 @@ -5085,10 +5072,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] @@ -5102,14 +5089,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5128,9 +5114,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm2 @@ -5150,13 +5136,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5172,12 +5158,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm13[2,3],ymm9[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -5193,11 +5181,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7] @@ -5210,360 +5197,360 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm5[0],ymm8[0],ymm5[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,2],ymm11[1,3],ymm6[4,6],ymm11[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,2],ymm10[1,3],ymm9[4,6],ymm10[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm13[1,3],ymm14[4,6],ymm13[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm7[1,3],ymm8[4,6],ymm7[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2],ymm8[1,3],ymm9[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm5[1,3],ymm7[4,6],ymm5[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm9[1,3],ymm11[4,6],ymm9[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3,4],ymm7[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 548(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 660(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 772(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 884(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[2,3,4],ymm10[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 216(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 664(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 888(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 584(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd 640(%rdi), %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 528(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 808(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vpermd 864(%rdi), %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 752(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5972,614 +5959,613 @@ ; SSE-LABEL: load_i32_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2456, %rsp # imm = 0x998 -; SSE-NEXT: movdqa 1088(%rdi), %xmm2 +; SSE-NEXT: movdqa 976(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1056(%rdi), %xmm3 +; SSE-NEXT: movdqa 944(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1008(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1024(%rdi), %xmm4 +; SSE-NEXT: movdqa 896(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 912(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 608(%rdi), %xmm5 +; SSE-NEXT: movdqa 528(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm6 +; SSE-NEXT: movdqa 496(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 448(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 464(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1456(%rdi), %xmm1 +; SSE-NEXT: movdqa 1344(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1472(%rdi), %xmm0 +; SSE-NEXT: movdqa 1360(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 1536(%rdi), %xmm2 +; SSE-NEXT: movdqa 1424(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1504(%rdi), %xmm0 +; SSE-NEXT: movdqa 1392(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm14 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 448(%rdi), %xmm2 +; SSE-NEXT: movdqa 560(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa 464(%rdi), %xmm0 +; SSE-NEXT: movdqa 576(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 528(%rdi), %xmm1 +; SSE-NEXT: movdqa 640(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 496(%rdi), %xmm0 +; SSE-NEXT: movdqa 608(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 896(%rdi), %xmm2 +; SSE-NEXT: movdqa 1008(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 912(%rdi), %xmm0 +; SSE-NEXT: movdqa 1024(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 976(%rdi), %xmm1 +; SSE-NEXT: movdqa 1088(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 944(%rdi), %xmm0 +; SSE-NEXT: movdqa 1056(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1344(%rdi), %xmm2 +; SSE-NEXT: movdqa 1456(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1360(%rdi), %xmm0 +; SSE-NEXT: movdqa 1472(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1424(%rdi), %xmm1 +; SSE-NEXT: movdqa 1536(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1392(%rdi), %xmm0 +; SSE-NEXT: movdqa 1504(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 336(%rdi), %xmm11 -; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movdqa 304(%rdi), %xmm9 +; SSE-NEXT: movdqa 272(%rdi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,3,3] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 416(%rdi), %xmm8 -; SSE-NEXT: movdqa 384(%rdi), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 784(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 672(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 688(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 864(%rdi), %xmm13 -; SSE-NEXT: movdqa 832(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 752(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1232(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1248(%rdi), %xmm0 +; SSE-NEXT: movdqa 720(%rdi), %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1120(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1136(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 1312(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1200(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1168(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1568(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1584(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1648(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1280(%rdi), %xmm0 +; SSE-NEXT: movdqa 1616(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1680(%rdi), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 336(%rdi), %xmm6 +; SSE-NEXT: movdqa 352(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 416(%rdi), %xmm2 +; SSE-NEXT: movdqa 384(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 784(%rdi), %xmm14 +; SSE-NEXT: movdqa 800(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 864(%rdi), %xmm7 +; SSE-NEXT: movdqa 832(%rdi), %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1232(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1248(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa 1312(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1280(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1680(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1696(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa 1760(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1728(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 224(%rdi), %xmm7 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa 304(%rdi), %xmm3 -; SSE-NEXT: movdqa 272(%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 672(%rdi), %xmm10 -; SSE-NEXT: movdqa 688(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa 752(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 720(%rdi), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1120(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1136(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa 368(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1200(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[1,1,1,1] +; SSE-NEXT: movdqa 480(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1168(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1568(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1584(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; SSE-NEXT: movdqa 1648(%rdi), %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa 592(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1616(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa 704(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm4[0],xmm15[1] -; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa 368(%rdi), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm3[0],xmm10[1] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 816(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm4[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa 256(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 592(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] -; SSE-NEXT: movdqa 480(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,1,1,1] -; SSE-NEXT: movdqa 816(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa 704(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1040(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[1,1,1,1] +; SSE-NEXT: movdqa 928(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 928(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1264(%rdi), %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa 1152(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1488(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1376(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,1,1] -; SSE-NEXT: movdqa 1712(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 1600(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 1040(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 176(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: movdqa 1152(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa 1264(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 288(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa 1376(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa 1488(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 512(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] +; SSE-NEXT: movdqa 1600(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 624(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 1712(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm10[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 176(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 512(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 624(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 736(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 848(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 848(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 960(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 960(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1072(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 1072(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1184(%rdi), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 1184(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; SSE-NEXT: movdqa 1296(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; SSE-NEXT: movdqa 1408(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1520(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 1520(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; SSE-NEXT: movdqa 1632(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[2,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa 1744(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[2,3,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movdqa 1744(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6589,68 +6575,69 @@ ; SSE-NEXT: movdqa 320(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 656(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 768(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 880(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6662,15 +6649,14 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1104(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6679,11 +6665,10 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1216(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6698,15 +6683,14 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1440(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6714,9 +6698,10 @@ ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 1552(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 1552(%rdi), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6732,8 +6717,7 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 1776(%rdi), %xmm0 @@ -6743,79 +6727,80 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6823,43 +6808,35 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] +; SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] @@ -6870,23 +6847,34 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] @@ -6894,45 +6882,34 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6941,17 +6918,16 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6960,8 +6936,16 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -6970,25 +6954,26 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -6998,36 +6983,34 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -7036,16 +7019,15 @@ ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,2] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7053,34 +7035,36 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: # xmm15 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1] ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] @@ -7088,8 +7072,7 @@ ; SSE-NEXT: # xmm14 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7099,21 +7082,21 @@ ; SSE-NEXT: # xmm13 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[0,0,1,1] @@ -7124,21 +7107,23 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -7153,8 +7138,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] @@ -7170,12 +7154,12 @@ ; SSE-NEXT: # xmm6 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1] -; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] @@ -7185,7 +7169,8 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] @@ -7195,8 +7180,7 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] @@ -7206,20 +7190,13 @@ ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,3,2,3] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -7228,13 +7205,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7244,38 +7221,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -7428,252 +7413,323 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3144, %rsp # imm = 0xC48 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $3112, %rsp # imm = 0xC28 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps %xmm15, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6],ymm3[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm6[1,2],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm10[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,xmm7[1,2],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1],ymm5[2,2],ymm11[5,5],ymm5[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm5[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0],ymm6[3,3],ymm7[4,4],ymm6[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm6[1,2],xmm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm0[2,2],ymm15[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0],ymm1[3,3],ymm5[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm14[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm13[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm15[1] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1],ymm0[2,2],ymm13[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0],xmm12[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0],ymm1[3,3],ymm8[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 @@ -7683,20 +7739,18 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,2],ymm0[5,5],ymm1[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 @@ -7713,835 +7767,750 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm12[1],mem[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7] +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm1[3,3],ymm3[4,4],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,xmm1[1,2],mem[0] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm1[2,0],ymm2[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps $8, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1],ymm1[2,0],ymm15[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,1],ymm1[2,0],ymm6[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,1],mem[0,3],ymm1[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,1],ymm1[2,0],ymm14[6,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm6[0,3],ymm2[7,5],ymm6[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[2,1],ymm2[2,0],ymm13[6,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = zero,xmm2[1,2],mem[0] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm0[2,2],ymm3[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm14[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm6[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm9[0,3],ymm0[7,5],ymm9[4,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,1],ymm3[2,0],ymm1[6,5],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm9[2] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1],ymm0[2,2],ymm7[5,5],ymm0[6,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0],xmm9[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm3[0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,0],ymm14[3,3],ymm1[4,4],ymm14[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm15[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm12[0,3],ymm14[7,5],ymm12[4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm0[3,1],mem[0,3],ymm0[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm0[2,1],ymm15[2,0],ymm0[6,5],ymm15[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm11[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm5[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,0],ymm0[0,0],ymm3[5,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1],ymm14[0,2],ymm0[7,5],ymm14[4,6] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[3,1],mem[0,3],ymm14[7,5],mem[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm2[2,1],ymm14[2,0],ymm2[6,5],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,0],ymm14[0,0],ymm12[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm12[0,1],mem[1,3],ymm12[4,5],mem[5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm15[1,1],ymm0[4,4],ymm15[5,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2],ymm14[2,0],ymm12[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm14[0,0],ymm7[5,4],ymm14[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm14[3,1],ymm7[0,2],ymm14[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,1],ymm4[1,3],ymm8[4,5],ymm4[5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,0],ymm13[0,0],ymm6[5,4],ymm13[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[3,1],ymm10[0,2],ymm13[7,5],ymm10[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm4[1,1],ymm7[4,4],ymm4[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[0,2],ymm10[2,0],ymm5[4,6],ymm10[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,2],ymm14[2,0],ymm0[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0],ymm4[0,0],ymm7[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1],ymm7[0,2],ymm4[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,1],ymm1[1,3],ymm4[4,5],ymm1[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,2],ymm14[2,0],ymm15[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm10[0,0],ymm0[5,4],ymm10[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,1],ymm0[0,2],ymm10[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,0],ymm11[1,1],ymm3[4,4],ymm11[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm13[2,0],ymm10[4,6],ymm13[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,0],ymm1[0,0],ymm9[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,0],ymm1[0,0],ymm4[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,1],ymm7[0,2],ymm1[7,5],ymm7[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[0,1],ymm10[1,3],ymm1[4,5],ymm10[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[0,2],ymm14[2,0],ymm11[4,6],ymm14[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm2[0,0],ymm5[5,4],ymm2[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm4[1,1],ymm1[4,4],ymm4[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm10[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,1],ymm8[1,3],ymm9[4,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm5[2,0],ymm15[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0],ymm13[1,1],ymm3[4,4],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm14 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm5[2,0],ymm14[4,6],ymm5[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,0],ymm5[1,1],ymm6[4,4],ymm5[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[1,1],ymm1[4,4],ymm9[5,5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[0,1],ymm1[1,3],ymm11[4,5],ymm1[5,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[1,1],ymm1[4,4],ymm8[5,5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,2],ymm6[2,0],ymm2[4,6],ymm6[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm5[0,0],ymm2[7,4],ymm5[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[1,0],ymm1[2,0],ymm11[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm0[0,0],ymm12[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm3[2,0],ymm13[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,0],ymm0[0,0],ymm15[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm0[0,0],ymm10[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm11[2,0],ymm3[5,4],ymm11[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm4[2,0],ymm11[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = mem[0],xmm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm8[2,0],ymm2[5,4],ymm8[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,0],ymm0[0,0],ymm4[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm5[2,0],ymm0[6,4],ymm5[6,4] +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[3,0],ymm5[0,0],ymm4[7,4],ymm5[4,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[2,0],ymm6[2,0],ymm5[6,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0],xmm12[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0],xmm9[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm7[2,0],mem[3,1],ymm7[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0],xmm8[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm7[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,0],ymm14[1,0],ymm13[4,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2],xmm7[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[0,1],xmm9[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm3[2,0],mem[3,1],ymm3[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0],xmm12[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[0,1],xmm7[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm10[2,1],mem[3,3],ymm10[6,5],mem[7,7] +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm11[2,0],mem[3,1],ymm11[6,4],mem[7,5] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,1],xmm3[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[2,0],mem[3,1],ymm2[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm3[2,0],ymm6[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0],ymm7[1,0],ymm6[4,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,1],ymm8[3,3],ymm2[6,5],ymm8[7,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[0,1],xmm2[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm15[2,0],mem[3,1],ymm15[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm10[2,0],mem[3,1],ymm10[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0],xmm10[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[2,0],ymm2[5,4],ymm1[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[3,0],ymm0[0,0],ymm11[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm4[2,0],mem[3,1],ymm4[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm1[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $114, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[2,0],mem[3,1],ymm0[6,4],mem[7,5] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0],xmm5[1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,0],mem[1,0],ymm2[4,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[3,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm1[0,0],ymm10[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[0,0],ymm3[7,4],ymm2[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,0],ymm3[0,0],ymm7[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,0],ymm7[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[0,0],ymm5[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,0],ymm8[4,5],ymm3[6,4] -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm9[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[3,0],ymm8[0,0],ymm9[7,4],ymm8[4,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0],ymm2[0,0],ymm15[7,4],ymm2[4,4] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,0],ymm9[4,5],ymm2[6,4] ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm4[0,0],ymm7[7,4],ymm4[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,0],ymm9[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[3,0],ymm9[0,0],ymm6[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[3,0],ymm9[0,0],ymm6[7,4],ymm9[4,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4] +; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm10 = xmm10[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8551,13 +8520,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8567,26 +8536,34 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r9) @@ -8617,48 +8594,46 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3144, %rsp # imm = 0xC48 +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) +; AVX1-ONLY-NEXT: addq $3112, %rsp # imm = 0xC28 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2664, %rsp # imm = 0xA68 -; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-SLOW-NEXT: subq $2648, %rsp # imm = 0xA58 +; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8668,155 +8643,153 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm4 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq 1424(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastq 1648(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8825,20 +8798,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3,4],ymm11[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8847,20 +8820,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8869,22 +8842,21 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8893,20 +8865,20 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8915,19 +8887,21 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8936,17 +8910,17 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 1504(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1696(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8963,17 +8937,15 @@ ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8985,207 +8957,176 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm12 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm4[3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1772(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2],ymm4[1,3],ymm9[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm6[1,3],ymm11[4,6],ymm6[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm12[1,3],ymm6[4,6],ymm12[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9196,10 +9137,24 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm11[1,3],ymm8[4,6],ymm11[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm13[1,3],ymm2[4,6],ymm13[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9211,16 +9166,32 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm7[1,3],ymm1[4,6],ymm7[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 1776(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-SLOW-NEXT: # xmm0 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9228,9 +9199,9 @@ ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9243,135 +9214,134 @@ ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm10, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm7 = xmm4[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] @@ -9382,8 +9352,8 @@ ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm1 @@ -9393,823 +9363,807 @@ ; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm5[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm12[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r9) +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm10, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1648(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rax) -; AVX2-SLOW-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) +; AVX2-SLOW-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 992(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm15[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm11[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm11[6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm15[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 1344(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm7[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1440(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 1120(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 992(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 1200(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 1568(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 1440(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq 1424(%rdi), %ymm1 +; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm15[2,3],ymm10[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1024(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1280(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa 1248(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 1184(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3,4],ymm4[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1324(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm10[0],ymm7[2],ymm10[2] -; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1200(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm6[0],ymm2[0],ymm6[2],ymm2[2] +; AVX2-FAST-NEXT: vpbroadcastd 1324(%rdi), %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm3[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm3[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm4[0],mem[0],ymm4[2],mem[2] +; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0],ymm11[1],mem[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] -; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,2],ymm14[1,3],ymm12[4,6],ymm14[5,7] -; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm13[1,3],ymm11[4,6],ymm13[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm6[0,2],mem[1,3],ymm6[4,6],mem[5,7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] -; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm5[1,3],ymm7[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,2],ymm14[1,3],ymm11[4,6],ymm14[5,7] +; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm8[1,3],ymm10[4,6],ymm8[5,7] +; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -10219,11 +10173,11 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm8 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm10[1,3],ymm7[4,6],ymm10[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 -; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10235,9 +10189,8 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] -; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm2[1,3],ymm6[4,6],ymm2[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10249,166 +10202,182 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm15[0,2],mem[1,3],ymm15[4,6],mem[5,7] +; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-FAST-NEXT: # xmm5 = mem[0,0] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 324(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 548(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm12[0,1,2],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] @@ -10417,7 +10386,7 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10429,12 +10398,14 @@ ; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] @@ -10445,180 +10416,171 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm11[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm13 = xmm14[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm10, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm13[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm10, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10628,13 +10590,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10644,13 +10606,13 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -10660,22 +10622,30 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r9) @@ -10687,66 +10657,65 @@ ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm15, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, (%rax) ; AVX2-FAST-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2664, %rsp # imm = 0xA68 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: subq $2648, %rsp # imm = 0xA58 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10756,155 +10725,153 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 868(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 644(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1316(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1092(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1648(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1764(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1540(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 644(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 868(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1092(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1316(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1424(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1648(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1540(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1764(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10913,20 +10880,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm11[1],ymm6[2,3,4],ymm11[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10935,20 +10902,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10957,22 +10924,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3],ymm8[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10981,20 +10947,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11003,19 +10969,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11024,17 +10992,17 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1504(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1728(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1696(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11051,17 +11019,15 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm3[2,3],ymm9[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11073,207 +11039,176 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm9[0],ymm5[0],ymm9[2],ymm5[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm12[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm11[0],ymm6[0],ymm11[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm13[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2],ymm5[1,3],ymm9[4,6],ymm5[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm3[1,3],ymm6[4,6],ymm3[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,2],ymm8[1,3],ymm15[4,6],ymm8[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm12 = ymm10[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm4[3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1772(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm14[0,2],mem[1,3],ymm14[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $253, (%rsp), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm9[0,2],ymm4[1,3],ymm9[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,2],ymm1[1,3],ymm4[4,6],ymm1[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2],ymm6[1,3],ymm11[4,6],ymm6[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm12[1,3],ymm6[4,6],ymm12[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm10[1,3],ymm11[4,6],ymm10[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11284,10 +11219,24 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm7[1,3],ymm6[4,6],ymm7[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm11[1,3],ymm8[4,6],ymm11[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm13[1,3],ymm2[4,6],ymm13[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11299,16 +11248,32 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm5[1,3],ymm3[4,6],ymm5[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm15[1,3],ymm14[4,6],ymm15[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2],ymm7[1,3],ymm1[4,6],ymm7[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1776(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11316,9 +11281,9 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [0,7,0,7,0,7,0,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [0,7,0,7,0,7,0,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -11331,135 +11296,134 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm10, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm4[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] @@ -11470,8 +11434,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm1 @@ -11481,428 +11445,425 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm10, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: load_i32_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm14 +; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm14 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm2, %zmm3 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm2, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm4, %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm9 @@ -11910,456 +11871,450 @@ ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm22, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm22, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm12, %zmm4, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm22 ; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm6, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm24, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm23, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm23, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm9, %zmm21, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm26, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm9, %zmm21, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm24 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm26 +; AVX512F-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 ; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm1, %zmm19 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm20 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <18,25,0,7,14,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm5, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm3, %zmm21 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,11,18,25] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [5,12,19,26] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm1, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm4, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm18, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm18, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm3 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm4 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm17 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] +; AVX512F-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 ; AVX512F-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm19 {%k1} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 {%k2} ; AVX512F-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k2} ; AVX512F-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, (%rsp), %zmm11, %zmm11 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm23, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512F-NEXT: vinserti32x4 $0, %xmm25, %zmm24, %zmm24 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm26, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm30, %zmm26, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm28, %zmm29, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm28 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm18 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512F-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512F-NEXT: vinserti32x4 $0, %xmm7, %zmm27, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm27, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm0, %zmm27, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm25, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512F-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i32_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3400, %rsp # imm = 0xD48 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm14 +; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm12 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [20,27,6,13,20,27,6,13,20,27,6,13,20,27,6,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,10,11,12,13,18,25,0,0,10,11,12,13,18,25] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,7,14,21,28,0,0,0,0,7,14,21,28,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,19,26,0,9,10,11,12,13,19,26] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm3 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,20,27,0,9,10,11,12,13,20,27] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,21,28,0,9,10,11,12,13,21,28] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,19,26,1,8,15,0,0,0,19,26,1,8,15,0,0] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,13,22,29,0,9,10,11,12,13,22,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm2, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm2, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm16 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [25,4,11,18,25,4,11,18,25,4,11,18,25,4,11,18] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm9 @@ -12367,324 +12322,321 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm22, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm22, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,9,10,11,12,16,23,30,0,9,10,11,12,16,23,30] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm22, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm12, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,9,10,11,12,17,24,31,0,9,10,11,12,17,24,31] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm6, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm24, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] -; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm27, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [27,6,13,20,27,6,13,20,27,6,13,20,27,6,13,20] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [28,0,0,0,0,7,14,21,28,0,0,0,0,7,14,21] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm29, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [29,0,0,0,1,8,15,22,29,0,0,0,1,8,15,22] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,0,18,25,0,7,14,0,0,0,18,25,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm23, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm28, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm21, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm27, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm26, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm28 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm9, %zmm21, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm24 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm26 +; AVX512BW-NEXT: vpermi2d %zmm21, %zmm9, %zmm27 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm21 ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <18,25,0,7,14,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <18,25,0,7,14,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [5,12,19,26] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm25 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm26 -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,11,18,25] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [5,12,19,26] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,13,20,27] -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm4, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm3 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13,20,27] +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm31, %zmm0 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm19 {%k1} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k2} ; AVX512BW-NEXT: movw $480, %ax # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k2} ; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, (%rsp), %zmm11, %zmm11 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm23, %zmm23 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm24, %zmm24 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm24, %zmm24 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm24 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm28, %zmm18 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm26, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm25, %zmm25 +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm26 # 16-byte Folded Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm30, %zmm26, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm29, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm28 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm27, %zmm18 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm29, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm28, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm27, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm27, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm25, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-NEXT: addq $3400, %rsp # imm = 0xD48 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i32>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -114,12 +114,14 @@ ; ; AVX512F-SLOW-LABEL: load_i32_stride8_vf2: ; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: pushq %rbx ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovd %xmm1, %ebx +; AVX512F-SLOW-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -142,17 +144,20 @@ ; AVX512F-SLOW-NEXT: vmovlps %xmm6, (%r11) ; AVX512F-SLOW-NEXT: vmovlps %xmm4, (%r10) ; AVX512F-SLOW-NEXT: vmovlps %xmm1, (%rax) +; AVX512F-SLOW-NEXT: popq %rbx ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i32_stride8_vf2: ; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: pushq %rbx ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-FAST-NEXT: vmovd %xmm1, %ebx +; AVX512F-FAST-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -160,8 +165,7 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,13,5,13,5,13,5,13] ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] @@ -176,17 +180,20 @@ ; AVX512F-FAST-NEXT: vmovq %xmm6, (%r11) ; AVX512F-FAST-NEXT: vmovq %xmm4, (%r10) ; AVX512F-FAST-NEXT: vmovq %xmm1, (%rax) +; AVX512F-FAST-NEXT: popq %rbx ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: load_i32_stride8_vf2: ; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: pushq %rbx ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-SLOW-NEXT: vmovd %xmm1, %ebx +; AVX512BW-SLOW-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -209,17 +216,20 @@ ; AVX512BW-SLOW-NEXT: vmovlps %xmm6, (%r11) ; AVX512BW-SLOW-NEXT: vmovlps %xmm4, (%r10) ; AVX512BW-SLOW-NEXT: vmovlps %xmm1, (%rax) +; AVX512BW-SLOW-NEXT: popq %rbx ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: load_i32_stride8_vf2: ; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: pushq %rbx ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-FAST-NEXT: vmovd %xmm1, %ebx +; AVX512BW-FAST-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm2 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -227,8 +237,7 @@ ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5] -; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm4, %ymm6 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] @@ -243,6 +252,7 @@ ; AVX512BW-FAST-NEXT: vmovq %xmm6, (%r11) ; AVX512BW-FAST-NEXT: vmovq %xmm4, (%r10) ; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rax) +; AVX512BW-FAST-NEXT: popq %rbx ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %wide.vec = load <16 x i32>, ptr %in.vec, align 64 @@ -493,7 +503,7 @@ ; SSE-NEXT: movaps 144(%rdi), %xmm9 ; SSE-NEXT: movaps (%rdi), %xmm11 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rdi), %xmm14 ; SSE-NEXT: movaps 64(%rdi), %xmm12 ; SSE-NEXT: movaps 160(%rdi), %xmm1 ; SSE-NEXT: movaps 128(%rdi), %xmm5 @@ -506,15 +516,15 @@ ; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; SSE-NEXT: movaps %xmm11, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] ; SSE-NEXT: movaps %xmm7, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] ; SSE-NEXT: movaps 240(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] @@ -523,18 +533,18 @@ ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm12[0] +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] ; SSE-NEXT: movaps %xmm10, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] ; SSE-NEXT: movaps %xmm9, %xmm12 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: movaps %xmm12, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm15[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm15 @@ -560,24 +570,24 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movaps %xmm7, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps %xmm6, 16(%rdx) -; SSE-NEXT: movaps %xmm14, (%rcx) +; SSE-NEXT: movaps %xmm7, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm13, (%rcx) ; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm11, (%r8) +; SSE-NEXT: movaps %xmm14, 16(%r9) ; SSE-NEXT: movaps %xmm4, (%r9) -; SSE-NEXT: movaps %xmm13, 16(%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm12, 16(%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movaps %xmm3, 16(%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps %xmm1, (%rax) @@ -899,89 +909,90 @@ ; SSE-LABEL: load_i32_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $296, %rsp # imm = 0x128 -; SSE-NEXT: movaps 288(%rdi), %xmm1 -; SSE-NEXT: movaps 352(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps 352(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rdi), %xmm7 -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 416(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm11 +; SSE-NEXT: movaps 480(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps 160(%rdi), %xmm11 ; SSE-NEXT: movaps 128(%rdi), %xmm15 ; SSE-NEXT: movaps 224(%rdi), %xmm5 ; SSE-NEXT: movaps 192(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; SSE-NEXT: movaps %xmm15, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm6[0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm9[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; SSE-NEXT: movaps 256(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm1, %xmm8 +; SSE-NEXT: movaps 256(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm9 ; SSE-NEXT: movaps 64(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: movaps (%rdi), %xmm2 ; SSE-NEXT: movaps 32(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm13[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -989,8 +1000,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 176(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1009,119 +1020,119 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 400(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps 400(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm12 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps 368(%rdi), %xmm15 ; SSE-NEXT: movaps 336(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; SSE-NEXT: movaps 304(%rdi), %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: movaps 304(%rdi), %xmm14 ; SSE-NEXT: movaps 272(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: movaps 112(%rdi), %xmm11 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movaps 16(%rdi), %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm15[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm2[1] -; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm3[1] -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%r9) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%r9) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm5, (%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm10, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm4, 16(%rax) ; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) ; SSE-NEXT: movaps %xmm13, 32(%rax) ; SSE-NEXT: movaps %xmm9, 16(%rax) -; SSE-NEXT: movaps %xmm6, (%rax) +; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: addq $296, %rsp # imm = 0x128 ; SSE-NEXT: retq ; @@ -1959,47 +1970,46 @@ ; SSE-LABEL: load_i32_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $952, %rsp # imm = 0x3B8 -; SSE-NEXT: movaps 544(%rdi), %xmm4 +; SSE-NEXT: movaps 672(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 608(%rdi), %xmm5 +; SSE-NEXT: movaps 736(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 672(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm2 +; SSE-NEXT: movaps 704(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm7 +; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm11 +; SSE-NEXT: movaps 128(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm0 +; SSE-NEXT: movaps 192(%rdi), %xmm15 +; SSE-NEXT: movaps 544(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 512(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE-NEXT: movaps 608(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 512(%rdi), %xmm1 +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2007,160 +2017,157 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 416(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 928(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movaps 320(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: movaps 288(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 832(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: movaps 800(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 768(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm11[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: movaps 800(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm2 +; SSE-NEXT: movaps 480(%rdi), %xmm6 +; SSE-NEXT: movaps 448(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: movaps 416(%rdi), %xmm4 +; SSE-NEXT: movaps 384(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 960(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps 928(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 896(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm15 +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; SSE-NEXT: movaps (%rdi), %xmm6 -; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps (%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps (%rsp), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: unpckhps (%rsp), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm15[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm0 @@ -2179,14 +2186,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 336(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps 336(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 304(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps 304(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2194,8 +2201,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 496(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 464(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: movaps 464(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 432(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2209,8 +2216,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 624(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 592(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps 592(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 560(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2224,8 +2231,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 752(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 720(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 720(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 688(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2239,8 +2246,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 880(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 848(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 848(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 816(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2254,13 +2261,13 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1008(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 976(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 976(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 944(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 912(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movaps 912(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -2272,25 +2279,20 @@ ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 16(%rdi), %xmm12 -; SSE-NEXT: movaps 48(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: movaps %xmm1, %xmm14 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2299,22 +2301,28 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload @@ -2324,80 +2332,83 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm14[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] +; SSE-NEXT: movaps %xmm13, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm6[1] +; SSE-NEXT: movaps %xmm11, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm5[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm3[1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) +; SSE-NEXT: movaps %xmm1, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps %xmm1, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm1, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 96(%r8) @@ -2443,26 +2454,27 @@ ; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rax) +; SSE-NEXT: movaps %xmm10, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm5, 112(%rax) ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movaps %xmm8, 64(%rax) -; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movaps %xmm8, 80(%rax) +; SSE-NEXT: movaps %xmm7, 64(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) ; SSE-NEXT: movaps %xmm6, 96(%rax) ; SSE-NEXT: movaps %xmm11, 80(%rax) ; SSE-NEXT: movaps %xmm13, 64(%rax) ; SSE-NEXT: movaps %xmm14, 48(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps %xmm12, (%rax) @@ -2471,25 +2483,25 @@ ; ; AVX1-ONLY-LABEL: load_i32_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1736, %rsp # imm = 0x6C8 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm14 +; AVX1-ONLY-NEXT: subq $1768, %rsp # imm = 0x6E8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] @@ -2497,83 +2509,83 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0] @@ -2597,7 +2609,7 @@ ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] @@ -2624,7 +2636,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -2651,10 +2663,10 @@ ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -2692,9 +2704,9 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -2728,7 +2740,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] @@ -2749,104 +2761,105 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps $238, (%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm8[0],ymm3[1],ymm8[1],ymm3[4],ymm8[4],ymm3[5],ymm8[5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[4],ymm1[4],ymm3[5],ymm1[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm12 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm12 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm12[2,0],ymm13[4,5],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2857,100 +2870,107 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[4],ymm3[4],ymm6[5],ymm3[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm12[1,0],ymm8[5,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5] +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm2[1,0],mem[1,0],ymm2[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,0],ymm8[1,0],ymm14[5,4],ymm8[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm2[1,0],ymm4[5,4],ymm2[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm7[1,0],ymm5[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[1,0],ymm1[5,4],ymm15[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,0],ymm8[1,0],ymm13[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[4],mem[4],ymm12[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[1,0],ymm1[5,4],ymm14[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[4],ymm11[4],ymm14[5],ymm11[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm13[1,0],mem[1,0],ymm13[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm6, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[6],ymm14[6],ymm8[7],ymm14[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload @@ -2960,35 +2980,34 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm7[1],ymm15[3],ymm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm12[1],ymm8[3],ymm12[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2999,21 +3018,8 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,0],ymm3[3,0],ymm7[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] @@ -3022,7 +3028,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3030,8 +3036,7 @@ ; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] @@ -3040,222 +3045,231 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,0],ymm4[3,0],ymm10[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,0],ymm2[2,3],ymm3[6,4],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[6],ymm15[6],ymm7[7],ymm15[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[6],ymm8[6],ymm12[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,0],ymm3[2,3],ymm4[6,4],ymm3[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,0],ymm11[3,0],ymm9[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX1-ONLY-NEXT: addq $1768, %rsp # imm = 0x6E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i32_stride8_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm11 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm13 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm14 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm15 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm10 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm4, %xmm15 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm11 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -3265,134 +3279,138 @@ ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm5[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -3400,8 +3418,8 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm12[2],mem[2],xmm12[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -3409,343 +3427,339 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm13[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm10 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm12 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[4],ymm0[4],ymm3[5],ymm0[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm5 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm3 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm4 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 148(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vbroadcastss 404(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 660(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 916(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vmovaps %ymm13, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[6],ymm3[6],ymm11[7],ymm3[7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm1[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 504(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 760(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm5[2],ymm15[2],ymm5[3],ymm15[3],ymm5[6],ymm15[6],ymm5[7],ymm15[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm14 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm6 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 1016(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm12 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastss 220(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm4[1],ymm10[3],ymm4[3] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 732(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 988(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm0[1],ymm7[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX2-ONLY-NEXT: vzeroupper @@ -6062,38 +6076,39 @@ ; SSE-LABEL: load_i32_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 352(%rdi), %xmm7 +; SSE-NEXT: movaps 416(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 416(%rdi), %xmm8 +; SSE-NEXT: movaps 480(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 384(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 480(%rdi), %xmm9 +; SSE-NEXT: movaps 448(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm10 +; SSE-NEXT: movaps 256(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 352(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 320(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 @@ -6101,26 +6116,11 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps 256(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 704(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 672(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 640(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps 384(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6141,14 +6141,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm1 +; SSE-NEXT: movaps 736(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps 704(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 928(%rdi), %xmm2 +; SSE-NEXT: movaps 672(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 896(%rdi), %xmm1 +; SSE-NEXT: movaps 640(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6158,8 +6158,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 832(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 800(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6171,14 +6171,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1248(%rdi), %xmm1 +; SSE-NEXT: movaps 992(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1216(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1184(%rdi), %xmm2 +; SSE-NEXT: movaps 928(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1152(%rdi), %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6188,8 +6188,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1120(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1088(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: movaps 1088(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1056(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6201,14 +6201,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1504(%rdi), %xmm1 +; SSE-NEXT: movaps 1248(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1472(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: movaps 1216(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1440(%rdi), %xmm2 +; SSE-NEXT: movaps 1184(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1408(%rdi), %xmm1 +; SSE-NEXT: movaps 1152(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6218,8 +6218,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1376(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1344(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps 1344(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1312(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6231,14 +6231,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1760(%rdi), %xmm1 +; SSE-NEXT: movaps 1504(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1728(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps 1472(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1696(%rdi), %xmm2 +; SSE-NEXT: movaps 1440(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1664(%rdi), %xmm1 +; SSE-NEXT: movaps 1408(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6248,8 +6248,8 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1632(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1600(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movaps 1600(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps 1568(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6261,14 +6261,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 2016(%rdi), %xmm1 +; SSE-NEXT: movaps 1760(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1984(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps 1728(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1952(%rdi), %xmm2 +; SSE-NEXT: movaps 1696(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: movaps 1664(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -6278,13 +6278,28 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1888(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1856(%rdi), %xmm2 +; SSE-NEXT: movaps 1856(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps 1824(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1792(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 2016(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1984(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 1824(%rdi), %xmm7 +; SSE-NEXT: movaps 1952(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1792(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] @@ -6296,25 +6311,22 @@ ; SSE-NEXT: movaps 64(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movaps (%rdi), %xmm11 +; SSE-NEXT: movaps (%rdi), %xmm12 ; SSE-NEXT: movaps 32(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: movaps %xmm12, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: movaps %xmm11, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] @@ -6323,6 +6335,9 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm11 # 16-byte Reload +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] @@ -6343,16 +6358,20 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] @@ -6363,8 +6382,8 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] @@ -6375,8 +6394,8 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] @@ -6387,70 +6406,71 @@ ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 @@ -6460,9 +6480,9 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm7 @@ -6472,38 +6492,33 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm14 ; SSE-NEXT: movaps 208(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6661,7 +6676,7 @@ ; SSE-NEXT: movaps 1456(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1424(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -6676,7 +6691,7 @@ ; SSE-NEXT: movaps 1584(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 1552(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -6811,16 +6826,16 @@ ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; SSE-NEXT: movaps (%rsp), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3] -; SSE-NEXT: movaps %xmm12, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -6900,16 +6915,16 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload ; SSE-NEXT: movaps %xmm7, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] @@ -6928,14 +6943,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 176(%rsi) @@ -6944,13 +6951,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps %xmm1, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps %xmm1, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps %xmm1, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) +; SSE-NEXT: movaps %xmm1, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6960,38 +6967,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 224(%rdx) +; SSE-NEXT: movaps %xmm1, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rdx) +; SSE-NEXT: movaps %xmm1, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rdx) +; SSE-NEXT: movaps %xmm1, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rdx) +; SSE-NEXT: movaps %xmm1, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rdx) +; SSE-NEXT: movaps %xmm1, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm1, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 224(%rcx) @@ -7148,9 +7163,9 @@ ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) @@ -7178,362 +7193,363 @@ ; AVX1-ONLY-LABEL: load_i32_stride8_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3688, %rsp # imm = 0xE68 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps $85, (%rsp), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7550,8 +7566,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7571,8 +7587,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7580,196 +7596,193 @@ ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm11[0,1],xmm14[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm15[2],mem[2],xmm15[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm13[2],xmm1[3],xmm13[3] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7777,10 +7790,12 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = mem[2,3,2,3] @@ -7881,7 +7896,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm3 @@ -7960,7 +7975,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -8068,7 +8083,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[4],ymm5[4],ymm12[5],ymm5[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $17, (%rsp), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 @@ -8123,7 +8138,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[4],mem[4],ymm8[5],mem[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] @@ -8208,7 +8223,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -8261,7 +8276,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,0],ymm8[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps (%rsp), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8352,7 +8367,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vshufps $51, (%rsp), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] @@ -8402,7 +8417,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 @@ -8425,14 +8440,6 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rsi) @@ -8441,13 +8448,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -8457,13 +8464,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -8473,22 +8480,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r9) @@ -8553,29 +8568,29 @@ ; AVX2-ONLY-LABEL: load_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -8583,59 +8598,59 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8643,59 +8658,59 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8703,59 +8718,59 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8763,9 +8778,9 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8777,51 +8792,44 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm2 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm10[0],xmm7[1],xmm10[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8831,16 +8839,33 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm13[0],xmm7[1],xmm13[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8850,16 +8875,17 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8869,29 +8895,17 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8905,13 +8919,13 @@ ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm10[0],mem[0],xmm10[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8925,13 +8939,13 @@ ; AVX2-ONLY-NEXT: # xmm0 = mem[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm9[0],mem[0],xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[1,1,1,1] @@ -8942,45 +8956,21 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8988,232 +8978,247 @@ ; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm9 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1,2],xmm4[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm4[0,1,2],xmm12[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, (%rsp), %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = mem[0,1,2],xmm8[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm5[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 @@ -9356,7 +9361,7 @@ ; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 @@ -9410,7 +9415,7 @@ ; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 @@ -9509,7 +9514,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] @@ -9537,7 +9542,7 @@ ; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermilps $85, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] @@ -9642,7 +9647,7 @@ ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9653,7 +9658,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 1784(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7] @@ -9677,7 +9682,7 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9701,7 +9706,7 @@ ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastss 476(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] @@ -9775,14 +9780,6 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -9791,13 +9788,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9807,13 +9804,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9823,22 +9820,30 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r9) @@ -9876,7 +9881,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) @@ -9897,7 +9902,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX2-ONLY-NEXT: addq $3464, %rsp # imm = 0xD88 ; AVX2-ONLY-NEXT: vzeroupper @@ -9906,602 +9911,603 @@ ; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -10509,602 +10515,603 @@ ; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm3, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -11112,602 +11119,603 @@ ; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512DQ-SLOW-NEXT: movb $-64, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm3, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -11715,602 +11723,603 @@ ; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512DQ-FAST-NEXT: movb $-64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm3, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -12318,602 +12327,603 @@ ; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm3, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -12921,602 +12931,603 @@ ; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm3, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -13524,602 +13535,603 @@ ; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm3, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQBW-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -14127,602 +14139,603 @@ ; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovaps 1152(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm15, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm29 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm15, %zmm5, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm0, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm8, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm5, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] ; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm3, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm21 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm5, %zmm7 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm25, %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm1, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm8, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm25 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm10, %zmm21, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm29, %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQBW-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll @@ -51,16 +51,16 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, (%rsi) -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf4: @@ -154,48 +154,48 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 80(%rdi), %xmm4 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps 112(%rdi), %xmm4 +; SSE-NEXT: movaps 96(%rdi), %xmm5 +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: movaps %xmm5, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm9, 32(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rsi) -; SSE-NEXT: movaps %xmm11, (%rsi) -; SSE-NEXT: movaps %xmm10, 16(%rsi) -; SSE-NEXT: movaps %xmm5, 32(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: movaps %xmm9, 48(%rsi) +; SSE-NEXT: movaps %xmm8, 32(%rsi) +; SSE-NEXT: movaps %xmm11, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm5, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -205,18 +205,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -246,58 +246,58 @@ ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps 240(%rdi), %xmm10 -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 144(%rdi), %xmm12 -; SSE-NEXT: movaps 128(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps 160(%rdi), %xmm6 -; SSE-NEXT: movaps 80(%rdi), %xmm14 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps 112(%rdi), %xmm15 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm5, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] -; SSE-NEXT: movaps %xmm7, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: movaps 240(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm3 +; SSE-NEXT: movaps 176(%rdi), %xmm12 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 144(%rdi), %xmm13 +; SSE-NEXT: movaps 128(%rdi), %xmm5 +; SSE-NEXT: movaps 112(%rdi), %xmm14 +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm15 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] -; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] -; SSE-NEXT: movaps %xmm13, 96(%rsi) -; SSE-NEXT: movaps %xmm9, 112(%rsi) -; SSE-NEXT: movaps %xmm15, 64(%rsi) -; SSE-NEXT: movaps %xmm12, 80(%rsi) -; SSE-NEXT: movaps %xmm11, 32(%rsi) -; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm11, 112(%rsi) +; SSE-NEXT: movaps %xmm12, 96(%rsi) +; SSE-NEXT: movaps %xmm13, 80(%rsi) +; SSE-NEXT: movaps %xmm14, 64(%rsi) +; SSE-NEXT: movaps %xmm15, 48(%rsi) +; SSE-NEXT: movaps %xmm10, 32(%rsi) ; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps %xmm4, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 80(%rdx) -; SSE-NEXT: movaps %xmm3, 64(%rdx) -; SSE-NEXT: movaps %xmm7, 48(%rdx) -; SSE-NEXT: movaps %xmm5, 32(%rdx) +; SSE-NEXT: movaps %xmm9, (%rsi) +; SSE-NEXT: movaps %xmm2, 112(%rdx) +; SSE-NEXT: movaps %xmm3, 96(%rdx) +; SSE-NEXT: movaps %xmm4, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm6, 48(%rdx) +; SSE-NEXT: movaps %xmm7, 32(%rdx) ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) @@ -309,30 +309,30 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm8[1],ymm1[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -342,34 +342,34 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,1,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -404,21 +404,21 @@ ; SSE-LABEL: load_i64_stride2_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps 192(%rdi), %xmm1 -; SSE-NEXT: movaps 80(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdi), %xmm13 -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 272(%rdi), %xmm9 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm4 -; SSE-NEXT: movaps 304(%rdi), %xmm12 -; SSE-NEXT: movaps 288(%rdi), %xmm6 -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps 240(%rdi), %xmm7 +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm10 +; SSE-NEXT: movaps 96(%rdi), %xmm0 +; SSE-NEXT: movaps 208(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps 304(%rdi), %xmm9 +; SSE-NEXT: movaps 176(%rdi), %xmm14 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 272(%rdi), %xmm12 +; SSE-NEXT: movaps 256(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps 128(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm2, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -454,236 +454,236 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 256(%rdi), %xmm0 +; SSE-NEXT: movaps 288(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm15 +; SSE-NEXT: movaps 336(%rdi), %xmm0 +; SSE-NEXT: movaps 320(%rdi), %xmm15 ; SSE-NEXT: movaps %xmm15, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] -; SSE-NEXT: movaps 336(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm13 +; SSE-NEXT: movaps 368(%rdi), %xmm0 +; SSE-NEXT: movaps 352(%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: movaps 400(%rdi), %xmm0 +; SSE-NEXT: movaps 384(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 400(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm6 +; SSE-NEXT: movaps 432(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] -; SSE-NEXT: movaps 496(%rdi), %xmm1 -; SSE-NEXT: movaps 480(%rdi), %xmm4 +; SSE-NEXT: movaps 464(%rdi), %xmm1 +; SSE-NEXT: movaps 448(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 464(%rdi), %xmm3 -; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 496(%rdi), %xmm3 +; SSE-NEXT: movaps 480(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm11 +; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm11, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps 32(%rdi), %xmm8 +; SSE-NEXT: movaps 48(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm8, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps %xmm12, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps %xmm5, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps %xmm0, 240(%rsi) +; SSE-NEXT: movaps %xmm12, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) -; SSE-NEXT: movaps %xmm10, 192(%rsi) +; SSE-NEXT: movaps %xmm5, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm7, (%rsi) -; SSE-NEXT: movaps %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm10, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps %xmm1, 224(%rdx) -; SSE-NEXT: movaps %xmm4, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm9, 208(%rdx) -; SSE-NEXT: movaps %xmm13, 160(%rdx) -; SSE-NEXT: movaps %xmm15, 176(%rdx) +; SSE-NEXT: movaps %xmm7, 16(%rsi) +; SSE-NEXT: movaps %xmm14, 192(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps %xmm1, 240(%rdx) +; SSE-NEXT: movaps %xmm4, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm9, 192(%rdx) +; SSE-NEXT: movaps %xmm13, 176(%rdx) +; SSE-NEXT: movaps %xmm15, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) -; SSE-NEXT: movaps %xmm11, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movaps %xmm11, (%rdx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm3, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm14[1],ymm5[3],ymm14[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm15[0],ymm7[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm15[1],ymm7[3],ymm15[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm14[1],ymm4[3],ymm14[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride2_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm7[0],ymm4[0],ymm7[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm5[0],ymm8[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm11[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 160(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm15[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -702,22 +702,22 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm10 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512-NEXT: vpermt2q %zmm3, %zmm8, %zmm11 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm11 +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,3,5,7,9,11,13,15] -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm7 -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 +; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <64 x i64>, ptr %in.vec, align 64 @@ -1060,30 +1060,30 @@ ; AVX1-ONLY-LABEL: load_i64_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdi), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] @@ -1091,7 +1091,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdi), %ymm5, %ymm15 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] @@ -1099,7 +1099,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm3, %ymm15 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm9[0],ymm15[2],ymm9[2] @@ -1111,101 +1111,101 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm3[0],ymm12[0],ymm3[2],ymm12[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 992(%rdi), %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1213,21 +1213,21 @@ ; AVX2-ONLY-LABEL: load_i64_stride2_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[2],ymm2[2] ; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] @@ -1255,126 +1255,126 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm4[0],ymm9[2],ymm4[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm4[1],ymm9[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] ; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm10[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermpd $216, (%rsp), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vpermpd $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,2,1,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1387,16 +1387,16 @@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm8 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,2,4,6,8,10,12,14] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm16, %zmm17 @@ -1410,34 +1410,34 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm16, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 ; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512-NEXT: vpermt2q %zmm3, %zmm16, %zmm23 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm23 +; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,3,5,7,9,11,13,15] +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm15 ; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm13 ; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 ; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm9 ; AVX512-NEXT: vpermt2q %zmm6, %zmm24, %zmm7 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm5 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm15 -; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm24, %zmm0 +; AVX512-NEXT: vpermt2q %zmm3, %zmm24, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm21, 256(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm23, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 256(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm13, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -56,57 +56,18 @@ ; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rcx) ; AVX2-ONLY-NEXT: retq ; -; AVX512F-SLOW-LABEL: load_i64_stride3_vf2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, (%rcx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: load_i64_stride3_vf2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] -; AVX512F-FAST-NEXT: vmovaps (%rdi), %zmm1 -; AVX512F-FAST-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] -; AVX512F-FAST-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512F-FAST-NEXT: vmovaps %xmm1, (%rsi) -; AVX512F-FAST-NEXT: vmovaps %xmm0, (%rdx) -; AVX512F-FAST-NEXT: vmovaps %xmm2, (%rcx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512BW-SLOW-LABEL: load_i64_stride3_vf2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, (%rdx) -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, (%rcx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: load_i64_stride3_vf2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [1,4] -; AVX512BW-FAST-NEXT: vmovaps (%rdi), %zmm1 -; AVX512BW-FAST-NEXT: vpermpd %zmm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7] -; AVX512BW-FAST-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX512BW-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-FAST-NEXT: vmovaps %xmm1, (%rsi) -; AVX512BW-FAST-NEXT: vmovaps %xmm0, (%rdx) -; AVX512BW-FAST-NEXT: vmovaps %xmm2, (%rcx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512-LABEL: load_i64_stride3_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX512-NEXT: vpunpcklqdq 24(%rdi){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; AVX512-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vmovaps %xmm2, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: retq %wide.vec = load <6 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <6 x i64> %wide.vec, <6 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <6 x i64> %wide.vec, <6 x i64> poison, <2 x i32> @@ -146,17 +107,18 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -210,46 +172,46 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { ; SSE-LABEL: load_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movapd 128(%rdi), %xmm2 -; SSE-NEXT: movapd 176(%rdi), %xmm1 +; SSE-NEXT: movapd 176(%rdi), %xmm2 +; SSE-NEXT: movapd 128(%rdi), %xmm1 ; SSE-NEXT: movapd 80(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm3 +; SSE-NEXT: movapd 144(%rdi), %xmm4 +; SSE-NEXT: movapd 160(%rdi), %xmm7 +; SSE-NEXT: movapd 96(%rdi), %xmm5 ; SSE-NEXT: movapd 112(%rdi), %xmm8 -; SSE-NEXT: movapd 144(%rdi), %xmm5 -; SSE-NEXT: movapd 160(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm6 -; SSE-NEXT: movapd 16(%rdi), %xmm10 -; SSE-NEXT: movapd 32(%rdi), %xmm4 -; SSE-NEXT: movapd 48(%rdi), %xmm7 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm10 ; SSE-NEXT: movapd 64(%rdi), %xmm11 ; SSE-NEXT: movapd %xmm11, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm7[0],xmm12[1] -; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] +; SSE-NEXT: movapd %xmm8, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1] -; SSE-NEXT: movapd %xmm8, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm3[0],xmm14[1] -; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: movapd %xmm7, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm4[0],xmm14[1] +; SSE-NEXT: movapd %xmm9, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm0[0] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm2[0] -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] +; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm3[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] -; SSE-NEXT: movapd %xmm14, 32(%rsi) -; SSE-NEXT: movapd %xmm13, 48(%rsi) -; SSE-NEXT: movapd %xmm15, (%rsi) +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: movapd %xmm14, 48(%rsi) +; SSE-NEXT: movapd %xmm13, 32(%rsi) ; SSE-NEXT: movapd %xmm12, 16(%rsi) -; SSE-NEXT: movapd %xmm3, 32(%rdx) -; SSE-NEXT: movapd %xmm5, 48(%rdx) +; SSE-NEXT: movapd %xmm15, (%rsi) +; SSE-NEXT: movapd %xmm4, 48(%rdx) +; SSE-NEXT: movapd %xmm5, 32(%rdx) +; SSE-NEXT: movapd %xmm10, 16(%rdx) ; SSE-NEXT: movapd %xmm6, (%rdx) -; SSE-NEXT: movapd %xmm7, 16(%rdx) -; SSE-NEXT: movapd %xmm2, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 48(%rcx) -; SSE-NEXT: movapd %xmm4, (%rcx) +; SSE-NEXT: movapd %xmm2, 48(%rcx) +; SSE-NEXT: movapd %xmm1, 32(%rcx) ; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm3, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf8: @@ -257,29 +219,31 @@ ; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0],ymm2[1],ymm5[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0],ymm4[1],ymm7[2],ymm4[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -358,62 +322,62 @@ ; SSE-LABEL: load_i64_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: movapd 176(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm4 +; SSE-NEXT: movapd 176(%rdi), %xmm0 +; SSE-NEXT: movapd 128(%rdi), %xmm1 ; SSE-NEXT: movapd 272(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm2 -; SSE-NEXT: movapd 96(%rdi), %xmm5 -; SSE-NEXT: movapd 112(%rdi), %xmm11 -; SSE-NEXT: movapd 144(%rdi), %xmm6 -; SSE-NEXT: movapd 160(%rdi), %xmm14 -; SSE-NEXT: movapd 192(%rdi), %xmm7 -; SSE-NEXT: movapd 208(%rdi), %xmm12 -; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 256(%rdi), %xmm13 -; SSE-NEXT: movapd 48(%rdi), %xmm9 +; SSE-NEXT: movapd 224(%rdi), %xmm4 +; SSE-NEXT: movapd 144(%rdi), %xmm5 +; SSE-NEXT: movapd 160(%rdi), %xmm11 +; SSE-NEXT: movapd 96(%rdi), %xmm6 +; SSE-NEXT: movapd 112(%rdi), %xmm13 +; SSE-NEXT: movapd 240(%rdi), %xmm7 +; SSE-NEXT: movapd 256(%rdi), %xmm12 +; SSE-NEXT: movapd 48(%rdi), %xmm10 ; SSE-NEXT: movapd 64(%rdi), %xmm15 +; SSE-NEXT: movapd 192(%rdi), %xmm9 +; SSE-NEXT: movapd 208(%rdi), %xmm14 ; SSE-NEXT: movapd %xmm15, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm2[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm2[0] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm15[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movapd %xmm13, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm6[0],xmm15[1] ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm11, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm5[0],xmm14[1] +; SSE-NEXT: movapd %xmm11, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm5[0],xmm13[1] ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1] -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm3[0] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm7[0],xmm13[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] +; SSE-NEXT: movapd %xmm14, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm14[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 336(%rdi), %xmm12 -; SSE-NEXT: movapd 352(%rdi), %xmm2 +; SSE-NEXT: movapd %xmm12, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm7[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 288(%rdi), %xmm12 +; SSE-NEXT: movapd 304(%rdi), %xmm2 ; SSE-NEXT: movapd %xmm2, %xmm7 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] -; SSE-NEXT: movapd 368(%rdi), %xmm4 +; SSE-NEXT: movapd 320(%rdi), %xmm4 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm4[0] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: movapd 288(%rdi), %xmm2 -; SSE-NEXT: movapd 304(%rdi), %xmm5 +; SSE-NEXT: movapd 336(%rdi), %xmm2 +; SSE-NEXT: movapd 352(%rdi), %xmm5 ; SSE-NEXT: movapd %xmm5, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 320(%rdi), %xmm0 +; SSE-NEXT: movapd 368(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; SSE-NEXT: movapd (%rdi), %xmm5 @@ -423,169 +387,173 @@ ; SSE-NEXT: movapd 32(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm3, 96(%rsi) -; SSE-NEXT: movapd %xmm14, 32(%rsi) -; SSE-NEXT: movapd %xmm7, 112(%rsi) -; SSE-NEXT: movapd %xmm15, 48(%rsi) -; SSE-NEXT: movapd %xmm13, 64(%rsi) -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm11, 80(%rsi) +; SSE-NEXT: movapd %xmm3, 112(%rsi) +; SSE-NEXT: movapd %xmm13, 48(%rsi) +; SSE-NEXT: movapd %xmm7, 96(%rsi) +; SSE-NEXT: movapd %xmm15, 32(%rsi) +; SSE-NEXT: movapd %xmm14, 80(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm12, 112(%rdx) +; SSE-NEXT: movapd %xmm11, 64(%rsi) +; SSE-NEXT: movapd %xmm6, (%rsi) +; SSE-NEXT: movapd %xmm2, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movapd %xmm12, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movapd %xmm10, 16(%rdx) +; SSE-NEXT: movapd %xmm9, 64(%rdx) ; SSE-NEXT: movapd %xmm5, (%rdx) -; SSE-NEXT: movapd %xmm10, 80(%rdx) -; SSE-NEXT: movapd %xmm9, 16(%rdx) -; SSE-NEXT: movapd %xmm0, 96(%rcx) -; SSE-NEXT: movapd %xmm4, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movapd %xmm0, 112(%rcx) +; SSE-NEXT: movapd %xmm4, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movapd %xmm1, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm1, (%rcx) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm4[1],ymm3[0],ymm4[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm5[1],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm7[1],ymm1[0],ymm7[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0],ymm10[1],ymm9[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm4[1],ymm11[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[1],ymm7[0],ymm9[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0],ymm8[1],ymm12[2],ymm8[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm11[1],ymm6[0],ymm11[3],ymm6[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm12[1],ymm5[0],ymm12[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm9[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm9[0,3,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rsi) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -639,56 +607,56 @@ ; SSE-LABEL: load_i64_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 272(%rdi), %xmm3 -; SSE-NEXT: movapd 128(%rdi), %xmm1 -; SSE-NEXT: movapd 176(%rdi), %xmm5 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 96(%rdi), %xmm6 -; SSE-NEXT: movapd 112(%rdi), %xmm11 -; SSE-NEXT: movapd 144(%rdi), %xmm7 -; SSE-NEXT: movapd 160(%rdi), %xmm12 -; SSE-NEXT: movapd 192(%rdi), %xmm8 -; SSE-NEXT: movapd 208(%rdi), %xmm13 -; SSE-NEXT: movapd 240(%rdi), %xmm9 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm10 +; SSE-NEXT: movapd 272(%rdi), %xmm1 +; SSE-NEXT: movapd 224(%rdi), %xmm2 +; SSE-NEXT: movapd 176(%rdi), %xmm3 +; SSE-NEXT: movapd 128(%rdi), %xmm4 +; SSE-NEXT: movapd 80(%rdi), %xmm5 +; SSE-NEXT: movapd 144(%rdi), %xmm6 +; SSE-NEXT: movapd 160(%rdi), %xmm11 +; SSE-NEXT: movapd 96(%rdi), %xmm7 +; SSE-NEXT: movapd 112(%rdi), %xmm13 +; SSE-NEXT: movapd 240(%rdi), %xmm8 +; SSE-NEXT: movapd 256(%rdi), %xmm12 +; SSE-NEXT: movapd 48(%rdi), %xmm9 ; SSE-NEXT: movapd 64(%rdi), %xmm15 +; SSE-NEXT: movapd 192(%rdi), %xmm10 +; SSE-NEXT: movapd 208(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm15, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm4[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm15[0],xmm4[1] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm15[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm13, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1] +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] +; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm13[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm11, %xmm4 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm3[0] ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm12, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm7[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm12[0],xmm5[1] -; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm0[0] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm10[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm2[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm12, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 288(%rdi), %xmm2 ; SSE-NEXT: movapd 304(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 @@ -739,107 +707,107 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 528(%rdi), %xmm15 +; SSE-NEXT: movapd 528(%rdi), %xmm14 ; SSE-NEXT: movapd 544(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm14[0],xmm15[1] ; SSE-NEXT: movapd 560(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm0[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 576(%rdi), %xmm11 +; SSE-NEXT: movapd 576(%rdi), %xmm12 ; SSE-NEXT: movapd 592(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 608(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 624(%rdi), %xmm8 ; SSE-NEXT: movapd 640(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 656(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: movapd %xmm0, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1] +; SSE-NEXT: movapd 656(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1] ; SSE-NEXT: movapd 672(%rdi), %xmm0 -; SSE-NEXT: movapd 688(%rdi), %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd 688(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: movapd 704(%rdi), %xmm10 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm10[0] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] -; SSE-NEXT: movapd 720(%rdi), %xmm2 -; SSE-NEXT: movapd 736(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; SSE-NEXT: movapd 752(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm7[0] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; SSE-NEXT: movapd (%rdi), %xmm4 -; SSE-NEXT: movapd 16(%rdi), %xmm6 -; SSE-NEXT: movapd %xmm6, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] +; SSE-NEXT: movapd 720(%rdi), %xmm1 +; SSE-NEXT: movapd 736(%rdi), %xmm3 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movapd 752(%rdi), %xmm6 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm6[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: movapd (%rdi), %xmm3 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE-NEXT: movapd 32(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; SSE-NEXT: movapd %xmm1, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rsi) -; SSE-NEXT: movapd %xmm3, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 176(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) -; SSE-NEXT: movapd %xmm14, 192(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 128(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rsi) -; SSE-NEXT: movapd %xmm5, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm9[0] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: movapd %xmm2, 240(%rsi) +; SSE-NEXT: movapd %xmm15, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 112(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rsi) +; SSE-NEXT: movapd %xmm7, 224(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 160(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm11, 208(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 144(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movapd %xmm4, (%rsi) +; SSE-NEXT: movapd %xmm1, 240(%rdx) ; SSE-NEXT: movapd %xmm0, 224(%rdx) -; SSE-NEXT: movapd %xmm2, 240(%rdx) -; SSE-NEXT: movapd %xmm11, 192(%rdx) ; SSE-NEXT: movapd %xmm8, 208(%rdx) +; SSE-NEXT: movapd %xmm12, 192(%rdx) +; SSE-NEXT: movapd %xmm14, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) -; SSE-NEXT: movapd %xmm15, 176(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) -; SSE-NEXT: movapd %xmm4, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm7, 240(%rcx) +; SSE-NEXT: movapd %xmm3, (%rdx) +; SSE-NEXT: movapd %xmm6, 240(%rcx) ; SSE-NEXT: movapd %xmm10, 224(%rcx) -; SSE-NEXT: movapd %xmm12, 208(%rcx) +; SSE-NEXT: movapd %xmm13, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -870,306 +838,303 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $168, %rsp ; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm5 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = mem[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm2[0],ymm8[3],ymm2[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[1],ymm5[0],ymm6[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm3[1],ymm9[0],ymm3[3],ymm9[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm1[1],ymm11[0],ymm1[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm14[1],ymm10[0],ymm14[3],ymm10[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[1],ymm7[0],ymm13[3],ymm7[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm7[1],ymm6[0],ymm7[3],ymm6[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm3[1],ymm8[0],ymm3[3],ymm8[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm4[1],ymm5[0],ymm4[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm12[1],ymm4[0],ymm12[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0],ymm0[1],ymm8[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm11[1],ymm2[0],ymm11[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm9[0],ymm4[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[1],ymm1[0],ymm5[3],ymm1[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm2[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm6[1],ymm11[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm13[1],ymm6[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm13[1],mem[2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm5[1],ymm13[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm11[1],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2],ymm13[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm13[2,3],mem[4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm11[1],ymm12[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0],ymm11[1],ymm9[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 192(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd %ymm1, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 160(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 192(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: addq $168, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $232, %rsp -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm12[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, (%rcx) ; AVX2-ONLY-NEXT: addq $232, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1180,21 +1145,21 @@ ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm9 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13 +; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm15 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm15 ; AVX512-NEXT: vpermt2q %zmm6, %zmm14, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm16 @@ -1202,26 +1167,26 @@ ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm17 ; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-NEXT: vpermt2q %zmm1, %zmm14, %zmm19 -; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm19 +; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm20 ; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14 ; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1 +; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm2 +; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm0 ; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm2 -; AVX512-NEXT: vpermt2q %zmm6, %zmm11, %zmm2 ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm3 ; AVX512-NEXT: vpermt2q %zmm10, %zmm11, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) @@ -1229,13 +1194,13 @@ ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <96 x i64>, ptr %in.vec, align 64 @@ -1740,275 +1705,245 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1128, %rsp # imm = 0x468 -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $936, %rsp # imm = 0x3A8 +; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[1],ymm4[0],ymm5[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = mem[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm2[0],ymm7[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm11[0],ymm2[3],ymm11[2] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm8[0],ymm14[3],ymm8[2] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm11[0],ymm10[3],ymm11[2] -; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm6[0],ymm12[3],ymm6[2] +; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[1],ymm4[0],ymm9[3],ymm4[2] +; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm11[0],ymm0[3],ymm11[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm1[0],ymm7[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[3],ymm8[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm13[0],ymm15[3],ymm13[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm5[0],ymm0[3],ymm5[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm5[0],ymm14[3],ymm5[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm10[0],ymm0[3],ymm10[2] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm0[0],ymm8[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm0[0],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[3],ymm7[2] -; AVX1-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = mem[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[1],ymm4[0],ymm12[3],ymm4[2] -; AVX1-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = mem[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm3[0],ymm6[3],ymm3[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = mem[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = mem[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[1],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = mem[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[1],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = mem[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm3[0],ymm5[3],ymm3[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = mem[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm9[1],ymm12[0],ymm9[3],ymm12[2] +; AVX1-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm12[1],ymm10[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0],ymm10[1],ymm12[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm10[1],ymm2[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0],ymm2[1],ymm10[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0],ymm2[1],ymm8[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm8[1],ymm2[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm6[1],ymm2[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0],ymm2[1],ymm7[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm7[1],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm2, %ymm7 +; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm7[2,3],mem[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0],ymm1[1],mem[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm14[0],ymm1[1],ymm14[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm0[1],ymm15[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm1[1],mem[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0],ymm7[1],ymm1[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0],ymm7[1],mem[2],ymm7[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendpd $5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm9[1],mem[2],ymm9[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm10[2,3],mem[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 448(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 384(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 192(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 416(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm7[1],ymm15[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 1264(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm11[1],ymm13[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd %ymm9, 480(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 416(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 288(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 448(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2025,307 +1960,323 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 480(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 448(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 416(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX1-ONLY-NEXT: addq $1128, %rsp # imm = 0x468 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride3_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $968, %rsp # imm = 0x3C8 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm9[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm8[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm5[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm4[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2333,43 +2284,27 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1072(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1264(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = mem[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 416(%rsi) @@ -2385,169 +2320,187 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm15, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rsi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 416(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rdx) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-ONLY-NEXT: addq $968, %rsp # imm = 0x3C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i64_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $200, %rsp +; AVX512-NEXT: subq $136, %rsp ; AVX512-NEXT: vmovaps 1472(%rdi), %zmm0 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm14 ; AVX512-NEXT: vmovaps 1280(%rdi), %zmm0 +; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm17 +; AVX512-NEXT: vmovaps 1088(%rdi), %zmm0 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm15 -; AVX512-NEXT: vmovaps 1088(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 -; AVX512-NEXT: vmovaps 896(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm28 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm25 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm23 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm16 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,3,6,9,12,15,u,u> ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 -; AVX512-NEXT: vpermt2q %zmm20, %zmm13, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm3 -; AVX512-NEXT: vpermt2q %zmm23, %zmm13, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512-NEXT: vpermt2q %zmm12, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm9 -; AVX512-NEXT: vpermt2q %zmm17, %zmm13, %zmm9 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm8 +; AVX512-NEXT: vpermt2q %zmm22, %zmm13, %zmm6 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm16, %zmm13, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512-NEXT: vpermt2q %zmm21, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm8 ; AVX512-NEXT: vpermt2q %zmm18, %zmm13, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512-NEXT: vpermt2q %zmm0, %zmm13, %zmm24 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512-NEXT: vpermt2q %zmm14, %zmm13, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = <1,4,7,10,13,u,u,u> -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512-NEXT: vpermt2q %zmm20, %zmm26, %zmm29 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512-NEXT: vpermt2q %zmm10, %zmm13, %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm26 +; AVX512-NEXT: vpermt2q %zmm9, %zmm13, %zmm26 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = <1,4,7,10,13,u,u,u> +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512-NEXT: vpermt2q %zmm22, %zmm27, %zmm28 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u> -; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm30 -; AVX512-NEXT: vpermt2q %zmm12, %zmm26, %zmm30 -; AVX512-NEXT: vpermt2q %zmm28, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm28 -; AVX512-NEXT: vpermt2q %zmm23, %zmm26, %zmm28 -; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512-NEXT: vpermt2q %zmm18, %zmm26, %zmm27 -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm18 -; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm26, %zmm22 -; AVX512-NEXT: vpermt2q %zmm21, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512-NEXT: vpermt2q %zmm14, %zmm26, %zmm21 -; AVX512-NEXT: vpermt2q %zmm16, %zmm31, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512-NEXT: vpermt2q %zmm0, %zmm26, %zmm16 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm13 -; AVX512-NEXT: vpermi2q %zmm15, %zmm19, %zmm26 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30 +; AVX512-NEXT: vpermt2q %zmm16, %zmm27, %zmm30 +; AVX512-NEXT: vpermt2q %zmm29, %zmm31, %zmm16 +; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29 +; AVX512-NEXT: vpermt2q %zmm21, %zmm27, %zmm29 +; AVX512-NEXT: vpermt2q %zmm25, %zmm31, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm23, %zmm25 +; AVX512-NEXT: vpermt2q %zmm18, %zmm27, %zmm25 +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512-NEXT: vpermt2q %zmm11, %zmm27, %zmm23 +; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm15 +; AVX512-NEXT: vpermt2q %zmm10, %zmm27, %zmm19 +; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 +; AVX512-NEXT: vpermt2q %zmm9, %zmm27, %zmm17 +; AVX512-NEXT: vpermt2q %zmm14, %zmm31, %zmm12 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512-NEXT: vpermi2q %zmm14, %zmm20, %zmm13 +; AVX512-NEXT: vpermi2q %zmm14, %zmm20, %zmm27 +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,10,13] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm11 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm8, %zmm9 ; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 -; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload ; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm24 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm26 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,8,11,14] -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm16 -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm29 -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm30 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm28 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm27 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm22 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm21 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm26 +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm28 +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm30 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm29 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm25 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm23 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm19 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm17 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,9,12,15] -; AVX512-NEXT: vpermt2q %zmm19, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm19 -; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm23 -; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm18 -; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm17 -; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 -; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm15 -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rsi) +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm22 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512-NEXT: vpermt2q %zmm5, %zmm31, %zmm20 +; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm21 +; AVX512-NEXT: vpermt2q %zmm3, %zmm31, %zmm18 +; AVX512-NEXT: vpermt2q %zmm2, %zmm31, %zmm16 +; AVX512-NEXT: vpermt2q %zmm1, %zmm31, %zmm15 +; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm12 +; AVX512-NEXT: vpermt2q %zmm6, %zmm31, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm26, 448(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm8, 320(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rsi) @@ -2555,23 +2508,23 @@ ; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm21, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm30, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm26, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm23, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx) -; AVX512-NEXT: addq $200, %rsp +; AVX512-NEXT: vmovdqa64 %zmm17, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm25, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm29, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm30, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm27, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm18, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm14, (%rcx) +; AVX512-NEXT: addq $136, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 @@ -2591,15 +2544,19 @@ ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW: {{.*}} +; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} +; AVX512BW-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} ; AVX512F: {{.*}} +; AVX512F-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} +; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -50,90 +50,22 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r8) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: load_i64_stride4_vf2: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rdx) -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, (%r8) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq -; -; AVX512F-SLOW-LABEL: load_i64_stride4_vf2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX512F-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-SLOW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-SLOW-NEXT: vmovaps %xmm2, (%rsi) -; AVX512F-SLOW-NEXT: vmovaps %xmm0, (%rdx) -; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, (%r8) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: load_i64_stride4_vf2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX512F-FAST-NEXT: vmovaps (%rdi), %ymm2 -; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512F-FAST-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512F-FAST-NEXT: vmovaps %xmm0, (%rsi) -; AVX512F-FAST-NEXT: vmovaps %xmm1, (%rdx) -; AVX512F-FAST-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512F-FAST-NEXT: vextractf128 $1, %ymm2, (%r8) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512BW-SLOW-LABEL: load_i64_stride4_vf2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] -; AVX512BW-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-SLOW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-SLOW-NEXT: vmovaps %xmm2, (%rsi) -; AVX512BW-SLOW-NEXT: vmovaps %xmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, (%r8) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: load_i64_stride4_vf2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0] -; AVX512BW-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovaps (%rdi), %xmm1 -; AVX512BW-FAST-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX512BW-FAST-NEXT: vmovaps (%rdi), %ymm2 -; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX512BW-FAST-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX512BW-FAST-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX512BW-FAST-NEXT: vmovaps %xmm0, (%rsi) -; AVX512BW-FAST-NEXT: vmovaps %xmm1, (%rdx) -; AVX512BW-FAST-NEXT: vextractf128 $1, %ymm4, (%rcx) -; AVX512BW-FAST-NEXT: vextractf128 $1, %ymm2, (%r8) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX2-LABEL: load_i64_stride4_vf2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX2-NEXT: vmovaps %xmm2, (%rsi) +; AVX2-NEXT: vmovaps %xmm0, (%rdx) +; AVX2-NEXT: vextractf128 $1, %ymm4, (%rcx) +; AVX2-NEXT: vextractf128 $1, %ymm1, (%r8) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %wide.vec = load <8 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <2 x i32> @@ -264,20 +196,20 @@ ; SSE-NEXT: movaps 16(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm12 ; SSE-NEXT: movaps 224(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 192(%rdi), %xmm7 ; SSE-NEXT: movaps 160(%rdi), %xmm14 -; SSE-NEXT: movaps 128(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm8 ; SSE-NEXT: movaps 96(%rdi), %xmm15 ; SSE-NEXT: movaps 64(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: movaps %xmm8, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm7, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1] ; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] @@ -296,13 +228,13 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, 48(%rsi) -; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm15, 32(%rsi) ; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm8, 48(%rdx) -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps %xmm7, 32(%rdx) +; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: movaps %xmm7, 48(%rdx) +; SSE-NEXT: movaps %xmm8, 32(%rdx) ; SSE-NEXT: movaps %xmm9, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: movaps %xmm6, 48(%rcx) ; SSE-NEXT: movaps %xmm10, 32(%rcx) ; SSE-NEXT: movaps %xmm12, 16(%rcx) @@ -483,61 +415,61 @@ ; SSE-LABEL: load_i64_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm5 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm6 -; SSE-NEXT: movaps 480(%rdi), %xmm2 -; SSE-NEXT: movaps 448(%rdi), %xmm7 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm8 -; SSE-NEXT: movaps 256(%rdi), %xmm13 -; SSE-NEXT: movaps 352(%rdi), %xmm9 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm6 +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm7 +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm10 +; SSE-NEXT: movaps 160(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdi), %xmm11 +; SSE-NEXT: movaps 352(%rdi), %xmm4 ; SSE-NEXT: movaps 320(%rdi), %xmm12 -; SSE-NEXT: movaps 96(%rdi), %xmm10 -; SSE-NEXT: movaps 64(%rdi), %xmm14 +; SSE-NEXT: movaps 96(%rdi), %xmm8 +; SSE-NEXT: movaps 64(%rdi), %xmm13 +; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps 256(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm9[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm8[1] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm3[0] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm3[1] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm10, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -546,381 +478,379 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps 208(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 304(%rdi), %xmm0 -; SSE-NEXT: movaps 272(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movaps 272(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 336(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 432(%rdi), %xmm0 -; SSE-NEXT: movaps 400(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps 400(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 464(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm4 ; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm15, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) -; SSE-NEXT: movaps %xmm15, (%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 96(%rcx) -; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm3, 112(%rcx) -; SSE-NEXT: movaps %xmm12, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm2, 112(%rcx) +; SSE-NEXT: movaps %xmm5, 96(%rcx) +; SSE-NEXT: movaps %xmm8, 80(%rcx) ; SSE-NEXT: movaps %xmm11, 64(%rcx) -; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps %xmm9, 80(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm14, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm6, (%rcx) ; SSE-NEXT: movaps %xmm1, 112(%r8) -; SSE-NEXT: movaps %xmm2, 96(%r8) +; SSE-NEXT: movaps %xmm3, 96(%r8) ; SSE-NEXT: movaps %xmm7, 80(%r8) -; SSE-NEXT: movaps %xmm10, 64(%r8) -; SSE-NEXT: movaps %xmm8, 48(%r8) -; SSE-NEXT: movaps %xmm13, 32(%r8) +; SSE-NEXT: movaps %xmm9, 64(%r8) +; SSE-NEXT: movaps %xmm10, 48(%r8) +; SSE-NEXT: movaps %xmm12, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm5, (%r8) +; SSE-NEXT: movaps %xmm4, (%r8) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm12[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm13[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm14[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) ; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm0[0] +; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm11[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm15[0] -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm15[1] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm8[0],ymm2[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = ymm12[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 16(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 80(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 112(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 48(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 16(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 80(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 112(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm4, 48(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1001,29 +931,29 @@ ; SSE-LABEL: load_i64_stride4_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 416(%rdi), %xmm0 -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm8 -; SSE-NEXT: movaps 480(%rdi), %xmm2 -; SSE-NEXT: movaps 448(%rdi), %xmm11 -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps 192(%rdi), %xmm10 -; SSE-NEXT: movaps 288(%rdi), %xmm5 -; SSE-NEXT: movaps 256(%rdi), %xmm12 -; SSE-NEXT: movaps 608(%rdi), %xmm3 -; SSE-NEXT: movaps 352(%rdi), %xmm6 -; SSE-NEXT: movaps 320(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm7 -; SSE-NEXT: movaps 64(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps 192(%rdi), %xmm8 +; SSE-NEXT: movaps 416(%rdi), %xmm2 +; SSE-NEXT: movaps 384(%rdi), %xmm11 +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rdi), %xmm10 +; SSE-NEXT: movaps 352(%rdi), %xmm5 +; SSE-NEXT: movaps 320(%rdi), %xmm13 +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps 544(%rdi), %xmm3 +; SSE-NEXT: movaps 288(%rdi), %xmm7 +; SSE-NEXT: movaps 256(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm4 @@ -1032,15 +962,15 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1051,21 +981,14 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm0 +; SSE-NEXT: movaps 512(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 512(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 704(%rdi), %xmm1 +; SSE-NEXT: movaps 608(%rdi), %xmm0 +; SSE-NEXT: movaps 576(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1078,8 +1001,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 -; SSE-NEXT: movaps 832(%rdi), %xmm1 +; SSE-NEXT: movaps 736(%rdi), %xmm0 +; SSE-NEXT: movaps 704(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1092,8 +1015,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 992(%rdi), %xmm0 -; SSE-NEXT: movaps 960(%rdi), %xmm1 +; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 832(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1106,6 +1029,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 992(%rdi), %xmm0 +; SSE-NEXT: movaps 960(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -1211,14 +1141,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -1227,13 +1149,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1243,37 +1165,45 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps %xmm1, 240(%rcx) ; SSE-NEXT: movaps %xmm3, 224(%rcx) ; SSE-NEXT: movaps %xmm6, 208(%rcx) @@ -1328,414 +1258,410 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1112, %rsp # imm = 0x458 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $1096, %rsp # imm = 0x448 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm6[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm14[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm9[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX1-ONLY-NEXT: addq $1112, %rsp # imm = 0x458 +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%r8) +; AVX1-ONLY-NEXT: addq $1096, %rsp # imm = 0x448 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1746,135 +1672,147 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm8[0],ymm14[0],ymm8[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm13 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm9[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm14[1],ymm8[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1884,67 +1822,45 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -1953,17 +1869,23 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) ; AVX2-ONLY-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1976,72 +1898,72 @@ ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm10 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm16 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm17 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,4,8,12,0,4,8,12] ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512-NEXT: vpermt2q %zmm8, %zmm19, %zmm13 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm13[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm13 -; AVX512-NEXT: vpermt2q %zmm17, %zmm19, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm18 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm18[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512-NEXT: vpermt2q %zmm10, %zmm19, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm8 +; AVX512-NEXT: vpermt2q %zmm18, %zmm19, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512-NEXT: vpermt2q %zmm14, %zmm19, %zmm13 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512-NEXT: vpermt2q %zmm12, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm16 +; AVX512-NEXT: vpermt2q %zmm9, %zmm19, %zmm16 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm16[0,1,2,3],zmm13[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512-NEXT: vpermt2q %zmm7, %zmm19, %zmm16 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512-NEXT: vpermt2q %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm20[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm20 ; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm20 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [1,5,9,13,1,5,9,13] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm20 -; AVX512-NEXT: vpermt2q %zmm12, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm22[0,1,2,3],zmm20[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22 -; AVX512-NEXT: vpermt2q %zmm17, %zmm21, %zmm22 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512-NEXT: vpermt2q %zmm14, %zmm21, %zmm23 +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,5,9,13,1,5,9,13] +; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 +; AVX512-NEXT: vpermt2q %zmm18, %zmm20, %zmm21 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512-NEXT: vpermt2q %zmm14, %zmm20, %zmm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm22[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512-NEXT: vpermt2q %zmm12, %zmm20, %zmm22 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm23 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],zmm22[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512-NEXT: vpermt2q %zmm10, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm23 +; AVX512-NEXT: vpermt2q %zmm7, %zmm20, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 -; AVX512-NEXT: vpermt2q %zmm4, %zmm21, %zmm24 +; AVX512-NEXT: vpermt2q %zmm4, %zmm20, %zmm24 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm24[0,1,2,3],zmm23[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512-NEXT: vpermt2q %zmm3, %zmm21, %zmm24 -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm21 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],zmm24[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm3, %zmm20, %zmm24 +; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,2,3],zmm24[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,6,10,14,2,6,10,14] ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm25 -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512-NEXT: vpermt2q %zmm8, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vpermt2q %zmm18, %zmm24, %zmm25 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm26 +; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm26 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm26[0,1,2,3],zmm25[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512-NEXT: vpermt2q %zmm14, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512-NEXT: vpermt2q %zmm12, %zmm24, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512-NEXT: vpermt2q %zmm9, %zmm24, %zmm27 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm27[0,1,2,3],zmm26[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512-NEXT: vpermt2q %zmm10, %zmm24, %zmm27 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm27 +; AVX512-NEXT: vpermt2q %zmm7, %zmm24, %zmm27 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm28 ; AVX512-NEXT: vpermt2q %zmm4, %zmm24, %zmm28 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm28[0,1,2,3],zmm27[4,5,6,7] @@ -2051,34 +1973,34 @@ ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm28[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,7,11,15,3,7,11,15] ; AVX512-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm17, %zmm28, %zmm16 +; AVX512-NEXT: vpermt2q %zmm18, %zmm28, %zmm17 ; AVX512-NEXT: vpermt2q %zmm14, %zmm28, %zmm15 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm16[4,5,6,7] -; AVX512-NEXT: vpermt2q %zmm10, %zmm28, %zmm7 -; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm17[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm12, %zmm28, %zmm11 -; AVX512-NEXT: vpermt2q %zmm8, %zmm28, %zmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm9, %zmm28, %zmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm11[4,5,6,7] +; AVX512-NEXT: vpermt2q %zmm7, %zmm28, %zmm6 +; AVX512-NEXT: vpermt2q %zmm4, %zmm28, %zmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm28, %zmm2 ; AVX512-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm20, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm26, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm9, 128(%r8) ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %wide.vec = load <128 x i64>, ptr %in.vec, align 64 @@ -2777,98 +2699,86 @@ ; AVX1-ONLY-LABEL: load_i64_stride4_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -2885,273 +2795,292 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 912(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1712(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1680(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3191,8 +3120,8 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -3221,7 +3150,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3232,200 +3161,193 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 496(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 320(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm13, (%r8) ; AVX1-ONLY-NEXT: addq $2680, %rsp # imm = 0xA78 ; AVX1-ONLY-NEXT: vzeroupper @@ -3434,188 +3356,188 @@ ; AVX2-ONLY-LABEL: load_i64_stride4_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $3016, %rsp # imm = 0xBC8 -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3626,180 +3548,180 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] @@ -3852,30 +3774,39 @@ ; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm7[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload @@ -3887,10 +3818,10 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3900,184 +3831,179 @@ ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 464(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 448(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 256(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 384(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 320(%rsi) -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 272(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 400(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 336(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 208(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 144(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 496(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 480(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 416(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 352(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 288(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 224(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 160(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 432(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 368(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 304(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 240(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 176(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 144(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 256(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 272(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 80(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 208(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 336(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 400(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 464(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 112(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 176(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 240(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 288(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 304(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 368(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 432(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm2, 496(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 288(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 368(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 304(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4097,42 +4023,42 @@ ; AVX512-LABEL: load_i64_stride4_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $2056, %rsp # imm = 0x808 -; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm18 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm17 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm26 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6 -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,4,8,12,0,4,8,12] -; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,4,8,12,0,4,8,12] +; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm28, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm28, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 -; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm23, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm20, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm17, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [1,5,9,13,1,5,9,13] ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] @@ -4142,16 +4068,16 @@ ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2q %zmm27, %zmm16, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm22, %zmm16, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2q %zmm18, %zmm16, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vpermt2q %zmm23, %zmm16, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [2,6,10,14,2,6,10,14] @@ -4168,26 +4094,26 @@ ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm27, %zmm31, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512-NEXT: vpermt2q %zmm22, %zmm29, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512-NEXT: vpermt2q %zmm26, %zmm29, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm22, %zmm31, %zmm2 +; AVX512-NEXT: vpermt2q %zmm26, %zmm31, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512-NEXT: vpermt2q %zmm23, %zmm29, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm23, %zmm31, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512-NEXT: vpermt2q %zmm18, %zmm29, %zmm2 +; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm18, %zmm31, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm20, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512-NEXT: vpermt2q %zmm17, %zmm29, %zmm1 +; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm17, %zmm31, %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4196,10 +4122,10 @@ ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm0 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 @@ -4208,10 +4134,10 @@ ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm20 -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm21 +; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm30 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4219,71 +4145,75 @@ ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm21 -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm26 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm26 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm28 +; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm26 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm26 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm28 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm0 ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm17 -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm4 +; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 +; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm23 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm23 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm25 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm25 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm27 ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm27 ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm4 -; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm19 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm24 ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm5 -; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm14 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 ; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm18 -; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm12 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm15 -; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm11 +; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm15 +; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm10 +; AVX512-NEXT: vpermt2q %zmm15, %zmm10, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm15, %zmm29, %zmm13 ; AVX512-NEXT: vpermt2q %zmm15, %zmm31, %zmm2 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm15 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 +; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm3 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6 ; AVX512-NEXT: vpermt2q %zmm0, %zmm16, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8 +; AVX512-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm15 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 +; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 ; AVX512-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -4292,100 +4222,96 @@ ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 64-byte Folded Reload ; AVX512-NEXT: # zmm31 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm26[0,1,2,3],zmm30[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm13[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm19[0,1,2,3],zmm23[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,2,3],zmm3[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm14[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm23[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm10[0,1,2,3],zmm3[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512-NEXT: # zmm19 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload ; AVX512-NEXT: # zmm23 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload +; AVX512-NEXT: # zmm30 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm28 # 64-byte Folded Reload ; AVX512-NEXT: # zmm28 = zmm28[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm25[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm14[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm16[0,1,2,3],zmm6[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512-NEXT: # zmm16 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512-NEXT: # zmm18 = zmm0[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 64-byte Folded Reload ; AVX512-NEXT: # zmm25 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm27[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm29[0,1,2,3],zmm9[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm27 # 64-byte Folded Reload -; AVX512-NEXT: # zmm27 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-NEXT: # zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm18[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm29[0,1,2,3],zmm8[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = zmm18[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm27 # 64-byte Folded Reload +; AVX512-NEXT: # zmm27 = zmm27[0,1,2,3],mem[4,5,6,7] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm29 # 64-byte Folded Reload +; AVX512-NEXT: # zmm29 = zmm29[0,1,2,3],mem[4,5,6,7] ; AVX512-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 64-byte Folded Reload ; AVX512-NEXT: # zmm20 = zmm20[0,1,2,3],mem[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm21[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm21[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm30, 320(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm26, 256(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm7, 448(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, 384(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm26, 320(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm31, 256(%rsi) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm5, 192(%rsi) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512-NEXT: vmovups (%rsp), %zmm5 # 64-byte Reload +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm22, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm14, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm22, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm28, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm30, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm23, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512-NEXT: vmovups (%rsp), %zmm3 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm3, 64(%rdx) ; AVX512-NEXT: vmovdqa64 %zmm6, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm5, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm24, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 384(%r8) -; AVX512-NEXT: vmovdqa64 %zmm4, 448(%r8) -; AVX512-NEXT: vmovdqa64 %zmm17, 256(%r8) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%r8) -; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm25, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512-NEXT: vmovdqa64 %zmm4, 384(%r8) +; AVX512-NEXT: vmovdqa64 %zmm19, 320(%r8) +; AVX512-NEXT: vmovdqa64 %zmm20, 256(%r8) ; AVX512-NEXT: vmovdqa64 %zmm29, 192(%r8) +; AVX512-NEXT: vmovdqa64 %zmm27, 128(%r8) +; AVX512-NEXT: vmovdqa64 %zmm18, 64(%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm27, 64(%r8) ; AVX512-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -4403,20 +4329,23 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW: {{.*}} +; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} +; AVX512BW-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} ; AVX512F: {{.*}} +; AVX512F-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} +; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -39,20 +39,20 @@ ; AVX1-ONLY-LABEL: load_i64_stride5_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r8) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%r8) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf2: @@ -80,17 +80,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512-NEXT: vmovaps (%rdi), %ymm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] -; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX512-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm3, (%rsi) ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) ; AVX512-NEXT: vextractf128 $1, %ymm2, (%rcx) -; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps %xmm4, (%r8) ; AVX512-NEXT: vmovdqa %xmm1, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -152,33 +152,32 @@ ; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[3],ymm9[2] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm4[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[3],ymm7[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],xmm8[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm5[0],xmm7[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -259,23 +258,23 @@ ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u> ; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> -; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u> +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,7,12,u> +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <11,0,5,u> +; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <12,1,6,u> ; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 -; AVX512F-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u> -; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %ymm5, (%r8) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -319,23 +318,23 @@ ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u> ; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4] -; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX512BW-FAST-NEXT: vpermi2q %ymm5, %ymm3, %ymm4 -; AVX512BW-FAST-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u> -; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u> +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,7,12,u> +; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <11,0,5,u> +; AVX512BW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm5 +; AVX512BW-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <12,1,6,u> ; AVX512BW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 -; AVX512BW-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm7 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u> -; AVX512BW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm7 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BW-FAST-NEXT: vmovdqa %ymm4, (%rdx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm3, (%rcx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm6, (%r8) +; AVX512BW-FAST-NEXT: vmovdqa %ymm5, (%r8) ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -356,153 +355,152 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i64_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movapd 304(%rdi), %xmm2 -; SSE-NEXT: movapd 64(%rdi), %xmm1 -; SSE-NEXT: movapd 224(%rdi), %xmm0 -; SSE-NEXT: movapd 256(%rdi), %xmm4 -; SSE-NEXT: movapd 176(%rdi), %xmm3 -; SSE-NEXT: movapd 288(%rdi), %xmm6 -; SSE-NEXT: movapd 208(%rdi), %xmm5 +; SSE-NEXT: movapd 224(%rdi), %xmm2 +; SSE-NEXT: movapd 144(%rdi), %xmm1 +; SSE-NEXT: movapd 64(%rdi), %xmm0 +; SSE-NEXT: movapd 176(%rdi), %xmm4 +; SSE-NEXT: movapd 96(%rdi), %xmm3 +; SSE-NEXT: movapd 208(%rdi), %xmm6 +; SSE-NEXT: movapd 128(%rdi), %xmm7 ; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm13 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd 32(%rdi), %xmm14 ; SSE-NEXT: movapd 48(%rdi), %xmm9 -; SSE-NEXT: movapd 240(%rdi), %xmm10 -; SSE-NEXT: movapd 272(%rdi), %xmm14 -; SSE-NEXT: movapd 160(%rdi), %xmm11 -; SSE-NEXT: movapd 192(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm5[0] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm0[0] -; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm8[0],xmm15[1] +; SSE-NEXT: movapd 160(%rdi), %xmm10 +; SSE-NEXT: movapd 192(%rdi), %xmm12 +; SSE-NEXT: movapd 80(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm15 +; SSE-NEXT: movapd %xmm14, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm11[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] +; SSE-NEXT: movapd %xmm12, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm6[0] ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm2[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm14[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm12[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 80(%rdi), %xmm14 -; SSE-NEXT: movapd 112(%rdi), %xmm4 +; SSE-NEXT: movapd 240(%rdi), %xmm12 +; SSE-NEXT: movapd 272(%rdi), %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] -; SSE-NEXT: movapd 128(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm0[0] -; SSE-NEXT: movapd 96(%rdi), %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] +; SSE-NEXT: movapd 288(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm0[0] +; SSE-NEXT: movapd 256(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 144(%rdi), %xmm2 +; SSE-NEXT: movapd 304(%rdi), %xmm2 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] -; SSE-NEXT: movapd %xmm3, 16(%rsi) -; SSE-NEXT: movapd %xmm13, 48(%rsi) -; SSE-NEXT: movapd %xmm15, (%rsi) -; SSE-NEXT: movapd %xmm12, 32(%rsi) -; SSE-NEXT: movapd %xmm14, 16(%rdx) -; SSE-NEXT: movapd %xmm10, 48(%rdx) +; SSE-NEXT: movapd %xmm3, 48(%rsi) +; SSE-NEXT: movapd %xmm15, 32(%rsi) +; SSE-NEXT: movapd %xmm14, 16(%rsi) +; SSE-NEXT: movapd %xmm13, (%rsi) +; SSE-NEXT: movapd %xmm12, 48(%rdx) +; SSE-NEXT: movapd %xmm10, 32(%rdx) +; SSE-NEXT: movapd %xmm11, 16(%rdx) ; SSE-NEXT: movapd %xmm8, (%rdx) -; SSE-NEXT: movapd %xmm11, 32(%rdx) -; SSE-NEXT: movapd %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm6, 48(%rcx) +; SSE-NEXT: movapd %xmm0, 48(%rcx) +; SSE-NEXT: movapd %xmm6, 32(%rcx) +; SSE-NEXT: movapd %xmm7, 16(%rcx) ; SSE-NEXT: movapd %xmm9, (%rcx) -; SSE-NEXT: movapd %xmm5, 32(%rcx) -; SSE-NEXT: movapd %xmm1, 16(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movapd %xmm7, (%r8) +; SSE-NEXT: movapd %xmm1, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movapd %xmm2, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movapd %xmm5, (%r8) +; SSE-NEXT: movapd %xmm2, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm10[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm13[0],ymm8[3],ymm13[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[0],ymm2[0],ymm13[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],xmm14[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm6[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovapd %ymm9, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm8, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -721,13 +719,13 @@ ; SSE-NEXT: movapd 128(%rdi), %xmm8 ; SSE-NEXT: movapd (%rdi), %xmm10 ; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm14 +; SSE-NEXT: movapd 32(%rdi), %xmm13 ; SSE-NEXT: movapd 48(%rdi), %xmm9 ; SSE-NEXT: movapd 160(%rdi), %xmm11 -; SSE-NEXT: movapd 192(%rdi), %xmm13 +; SSE-NEXT: movapd 192(%rdi), %xmm14 ; SSE-NEXT: movapd 80(%rdi), %xmm12 ; SSE-NEXT: movapd 112(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm14, %xmm15 +; SSE-NEXT: movapd %xmm13, %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1] ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] @@ -736,7 +734,7 @@ ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm3[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm14[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm13[0],xmm3[1] ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm12[0],xmm3[1] @@ -749,7 +747,7 @@ ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm13, %xmm0 +; SSE-NEXT: movapd %xmm14, %xmm0 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm6[0] @@ -758,31 +756,31 @@ ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 240(%rdi), %xmm2 -; SSE-NEXT: movapd 272(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 288(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movapd 272(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 288(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 304(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd 304(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 320(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 320(%rdi), %xmm15 ; SSE-NEXT: movapd 352(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 368(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0] ; SSE-NEXT: movapd 336(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -791,78 +789,78 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 400(%rdi), %xmm11 -; SSE-NEXT: movapd 432(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] +; SSE-NEXT: movapd 400(%rdi), %xmm9 +; SSE-NEXT: movapd 432(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] ; SSE-NEXT: movapd 448(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] ; SSE-NEXT: movapd 416(%rdi), %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] -; SSE-NEXT: movapd 464(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 464(%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 480(%rdi), %xmm2 -; SSE-NEXT: movapd 512(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1] -; SSE-NEXT: movapd 528(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm3[0] +; SSE-NEXT: movapd 512(%rdi), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm5 +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] +; SSE-NEXT: movapd 528(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm4[0] ; SSE-NEXT: movapd 496(%rdi), %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm8[0],xmm3[1] -; SSE-NEXT: movapd 544(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: movapd 560(%rdi), %xmm5 -; SSE-NEXT: movapd 592(%rdi), %xmm10 -; SSE-NEXT: movapd %xmm10, %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm8[0],xmm4[1] +; SSE-NEXT: movapd 544(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm10[0] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm6[0],xmm10[1] +; SSE-NEXT: movapd 560(%rdi), %xmm6 +; SSE-NEXT: movapd 592(%rdi), %xmm11 +; SSE-NEXT: movapd %xmm11, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] ; SSE-NEXT: movapd 608(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm0[0] ; SSE-NEXT: movapd 576(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 624(%rdi), %xmm4 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm10[0],xmm4[1] -; SSE-NEXT: movapd %xmm7, 96(%rsi) +; SSE-NEXT: movapd 624(%rdi), %xmm3 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: movapd %xmm7, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, 32(%rsi) -; SSE-NEXT: movapd %xmm6, 112(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 64(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm15, 80(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) +; SSE-NEXT: movaps %xmm7, 48(%rsi) +; SSE-NEXT: movapd %xmm5, 96(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 32(%rsi) +; SSE-NEXT: movapd %xmm14, 80(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movapd %xmm6, 112(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 48(%rdx) ; SSE-NEXT: movapd %xmm2, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) -; SSE-NEXT: movapd %xmm5, 112(%rdx) +; SSE-NEXT: movapd %xmm9, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movapd %xmm14, 64(%rdx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movapd %xmm15, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movapd %xmm11, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 96(%rcx) ; SSE-NEXT: movapd %xmm0, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movapd %xmm4, 96(%rcx) ; SSE-NEXT: movapd %xmm12, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm1, 112(%r8) ; SSE-NEXT: movapd %xmm8, 96(%r8) ; SSE-NEXT: movapd %xmm13, 80(%r8) @@ -876,8 +874,8 @@ ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm4, 112(%r9) -; SSE-NEXT: movapd %xmm9, 96(%r9) +; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm10, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -895,342 +893,338 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm11[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm8[0],xmm0[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm0[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm11[0],ymm5[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm3[0],ymm14[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm14[0],ymm15[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[3],ymm15[2] ; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm12[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm9, (%r8) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%r9) -; AVX1-ONLY-NEXT: addq $376, %rsp # imm = 0x178 +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%r9) +; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $360, %rsp # imm = 0x168 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm3[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm8[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm9[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 96(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r9) ; AVX2-ONLY-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1613,7 +1607,7 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 848(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 816(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1638,62 +1632,55 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 960(%rdi), %xmm10 -; SSE-NEXT: movapd 992(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] -; SSE-NEXT: movapd 1008(%rdi), %xmm15 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm15[0] +; SSE-NEXT: movapd 960(%rdi), %xmm12 +; SSE-NEXT: movapd 992(%rdi), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1008(%rdi), %xmm14 +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm14[0] ; SSE-NEXT: movapd 976(%rdi), %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1] -; SSE-NEXT: movapd 1024(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1] +; SSE-NEXT: movapd 1024(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1040(%rdi), %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1040(%rdi), %xmm7 ; SSE-NEXT: movapd 1072(%rdi), %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm8[0],xmm13[1] -; SSE-NEXT: movapd 1088(%rdi), %xmm9 -; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm9[0] -; SSE-NEXT: movapd 1056(%rdi), %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1] +; SSE-NEXT: movapd %xmm3, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd 1088(%rdi), %xmm10 +; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] +; SSE-NEXT: movapd 1056(%rdi), %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] ; SSE-NEXT: movapd 1104(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movapd 1120(%rdi), %xmm1 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 1120(%rdi), %xmm3 ; SSE-NEXT: movapd 1152(%rdi), %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm5 -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1] -; SSE-NEXT: movapd 1168(%rdi), %xmm6 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm6[0] +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1] +; SSE-NEXT: movapd 1168(%rdi), %xmm5 +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0] ; SSE-NEXT: movapd 1136(%rdi), %xmm11 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm11[0],xmm6[1] -; SSE-NEXT: movapd 1184(%rdi), %xmm2 -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm2[0] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1] +; SSE-NEXT: movapd 1184(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1200(%rdi), %xmm0 ; SSE-NEXT: movapd 1232(%rdi), %xmm4 ; SSE-NEXT: movapd %xmm4, %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE-NEXT: movapd 1248(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm3[0] -; SSE-NEXT: movapd 1216(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] -; SSE-NEXT: movapd 1264(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm12[0] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm4[0],xmm12[1] -; SSE-NEXT: movapd %xmm5, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movapd 1248(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE-NEXT: movapd 1216(%rdi), %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; SSE-NEXT: movapd 1264(%rdi), %xmm13 +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm13[0] +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm4[0],xmm13[1] ; SSE-NEXT: movapd %xmm2, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 176(%rsi) @@ -1701,52 +1688,60 @@ ; SSE-NEXT: movaps %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movapd %xmm14, 192(%rsi) +; SSE-NEXT: movapd %xmm6, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps %xmm2, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm13, 208(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm9, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm1, 224(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movapd %xmm0, 240(%rdx) -; SSE-NEXT: movapd %xmm10, 192(%rdx) -; SSE-NEXT: movapd %xmm8, 208(%rdx) +; SSE-NEXT: movapd %xmm3, 224(%rdx) +; SSE-NEXT: movapd %xmm7, 208(%rdx) +; SSE-NEXT: movapd %xmm12, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rdx) +; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rdx) +; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%rdx) +; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) -; SSE-NEXT: movapd %xmm6, 224(%rcx) -; SSE-NEXT: movapd %xmm9, 208(%rcx) -; SSE-NEXT: movapd %xmm15, 192(%rcx) +; SSE-NEXT: movapd %xmm1, 240(%rcx) +; SSE-NEXT: movapd %xmm5, 224(%rcx) +; SSE-NEXT: movapd %xmm10, 208(%rcx) +; SSE-NEXT: movapd %xmm14, 192(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1771,10 +1766,9 @@ ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movapd %xmm7, 240(%r8) +; SSE-NEXT: movapd %xmm8, 240(%r8) ; SSE-NEXT: movapd %xmm11, 224(%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%r8) +; SSE-NEXT: movapd %xmm15, 208(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1801,10 +1795,10 @@ ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm12, 240(%r9) +; SSE-NEXT: movapd %xmm13, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -1837,337 +1831,301 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1384, %rsp # imm = 0x568 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 +; AVX1-ONLY-NEXT: subq $1336, %rsp # imm = 0x538 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm8[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm11[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 512(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm6[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm13[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 960(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm5[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm1[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 672(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm2[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm3[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm14[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm12[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm15[0],ymm6[0],ymm15[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 816(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, (%rsp), %xmm6, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm12[0],ymm2[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm11[0],ymm8[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm2[0],ymm8[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[3],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = mem[0,1,2,3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3],xmm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vpblendw $15, (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = mem[0,1,2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2177,13 +2135,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2193,13 +2151,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2208,483 +2166,483 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 192(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 160(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm15, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, (%r9) -; AVX1-ONLY-NEXT: addq $1384, %rsp # imm = 0x568 +; AVX1-ONLY-NEXT: vmovapd %ymm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm14, (%r9) +; AVX1-ONLY-NEXT: addq $1336, %rsp # imm = 0x538 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1544, %rsp # imm = 0x608 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm15[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm13[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm8[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm15, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, (%rsp), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm10[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $1544, %rsp # imm = 0x608 +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r9) +; AVX2-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512F-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm20 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm28, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,5,10,15] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm27, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm28, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm30, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm16, %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm16 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] ; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm23, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm25 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 @@ -2698,192 +2656,190 @@ ; AVX512F-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 ; AVX512F-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512F-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <11,0,5,u> +; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm31, %zmm17 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm27, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm30, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm23, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm31, %zmm18 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 ; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm27 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm12[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm12 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm11 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[0,1,2,3],zmm28[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm28 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm13 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm31, %zmm12 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm16 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm24 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,8,13] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm17 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm8, %zmm17 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm26, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512F-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,5,10,15] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm13, %zmm0, %zmm4 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [0,5,10,15] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm27, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm28, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm30 = <1,6,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm30, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm30, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,10,15,0,5,10,15,0] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm16, %zmm26 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm17, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm16 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = <2,7,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm23, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm25 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 @@ -2897,132 +2853,131 @@ ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm29, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,0,5,u> -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm9 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <11,0,5,u> +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm31, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm30, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm17 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm30, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm18 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm30, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm3 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm14[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm10[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm10 = zmm28[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm12[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm12 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm11 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm11 = zmm2[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[0,1,2,3],zmm28[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm13 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm31, %zmm12 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm31, %zmm11 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm27 ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm16 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,8,13] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm15 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,15] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm8, %zmm17 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm27, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, (%r9) +; AVX512BW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <160 x i64>, ptr %in.vec, align 64 @@ -3850,626 +3805,546 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3288, %rsp # imm = 0xCD8 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: subq $3192, %rsp # imm = 0xC78 +; AVX1-ONLY-NEXT: vmovapd 736(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm13[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm4[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm6[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1856(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm8[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm10[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 672(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm11[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2080(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm13[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm14[0],ymm2[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 768(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1408(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2048(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2368(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1568(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2528(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm7[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1456(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 1936(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1936(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm13[2,3] +; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 1296(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm8[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm10[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1],xmm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm14[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm12[0],ymm9[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 896(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1216(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 1408(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1376(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] ; AVX1-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] ; AVX1-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] ; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 1888(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1856(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] ; AVX1-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2048(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] ; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[3],ymm6[2] ; AVX1-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2336(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] ; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovapd 2528(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2496(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] ; AVX1-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload @@ -4479,7 +4354,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] @@ -4551,53 +4426,37 @@ ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm11[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = mem[0,1,2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4615,21 +4474,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4647,38 +4506,54 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -4698,7 +4573,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) @@ -4713,10 +4588,10 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm5, 416(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 288(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 384(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 320(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm15, 256(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r9) @@ -4734,302 +4609,279 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $3288, %rsp # imm = 0xCD8 +; AVX1-ONLY-NEXT: addq $3192, %rsp # imm = 0xC78 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride5_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3240, %rsp # imm = 0xCA8 -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: subq $3288, %rsp # imm = 0xCD8 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm8[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm14[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm9[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm15[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm11[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm11 -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm13[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1008(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 528(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm8 @@ -5037,9 +4889,9 @@ ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1328(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 848(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm7 @@ -5047,100 +4899,121 @@ ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1648(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1488(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1968(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 1808(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm11 +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -5152,196 +5025,202 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqa 2464(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] @@ -5353,78 +5232,60 @@ ; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm8 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, (%rsp), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) @@ -5441,21 +5302,21 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5473,38 +5334,54 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -5537,61 +5414,78 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 448(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 384(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 352(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 256(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 224(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 192(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 448(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 384(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 352(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 320(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 288(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 256(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 224(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 192(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm10, 160(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 128(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX2-ONLY-NEXT: addq $3240, %rsp # imm = 0xCA8 +; AVX2-ONLY-NEXT: addq $3288, %rsp # imm = 0xCD8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride5_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm16, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 @@ -5600,257 +5494,229 @@ ; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm22 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <1,6,11,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <2,7,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <12,1,6,u> +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm22 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm21 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm19 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm21 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm31 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm29 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm7 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm7, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm9, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm12 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm8 ; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm0, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm8, %zmm0, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, (%rsp), %zmm3, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm2 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 64-byte Folded Reload +; AVX512F-NEXT: # zmm3 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm18[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm16[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm18 ; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm11 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm13 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm19 ; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $7, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -5862,68 +5728,71 @@ ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,5,6,12] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,12] +; AVX512F-NEXT: vpermt2q %zmm18, %zmm24, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm24, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm24, %zmm28 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5933,44 +5802,46 @@ ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm8 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5979,56 +5850,56 @@ ; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm28, 448(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm27, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 256(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512F-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -6036,36 +5907,53 @@ ; AVX512BW-LABEL: load_i64_stride5_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $3336, %rsp # imm = 0xD08 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm18 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,1,6,0,12,1,6,0] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,1,6,0,12,1,6,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm16, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,10,15,0,5,10,15,0] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,11,0,1,6,11,0,1] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 @@ -6074,257 +5962,229 @@ ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm10, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,11,0,1,6,11,0,1] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [7,12,0,2,7,12,0,2] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm11, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [7,12,0,2,7,12,0,2] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,0,11,0,5,0,11] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm3, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm18 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm9, %zmm19 -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm10 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,10,15] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u> +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,6,11,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,7,12,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <2,7,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,0,5,u> -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <11,0,5,u> +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <12,1,6,u> -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm31 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <12,1,6,u> +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm22 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm21 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm27 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm21 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm26 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm26 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm30 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm31 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm29 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm7, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm9, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm12 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm28, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm9 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm2 = zmm17[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm14[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 = zmm3[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm18[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm12, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm5 = zmm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm29[0,1,2,3],zmm19[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm0 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm8 +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, (%rsp), %zmm3, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm2 = zmm3[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: # zmm3 = zmm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm15[0,1,2,3],zmm18[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm16[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $7, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} @@ -6336,68 +6196,71 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,1,2,3,4,5,6,12] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm28, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,5,6,12] +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm28, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm28, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm28, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm28, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm24, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm28, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm24, %zmm28 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,8,13] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm24 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6407,44 +6270,46 @@ ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,9,14] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,10,15] -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm27 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm30 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm24 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6453,56 +6318,56 @@ ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 448(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm31, (%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) ; AVX512BW-NEXT: addq $3336, %rsp # imm = 0xD08 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -42,28 +42,51 @@ ; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: retq ; -; AVX-LABEL: load_i64_stride6_vf2: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX-NEXT: vmovaps 80(%rdi), %xmm3 -; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX-NEXT: vmovaps %xmm4, (%rsi) -; AVX-NEXT: vmovaps %xmm0, (%rdx) -; AVX-NEXT: vmovaps %xmm5, (%rcx) -; AVX-NEXT: vmovaps %xmm1, (%r8) -; AVX-NEXT: vmovaps %xmm6, (%r9) -; AVX-NEXT: vmovaps %xmm2, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: load_i64_stride6_vf2: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX1-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm3[0] +; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-NEXT: vmovaps %xmm4, (%rsi) +; AVX1-NEXT: vmovaps %xmm0, (%rdx) +; AVX1-NEXT: vmovaps %xmm5, (%rcx) +; AVX1-NEXT: vmovaps %xmm1, (%r8) +; AVX1-NEXT: vmovaps %xmm6, (%r9) +; AVX1-NEXT: vmovaps %xmm2, (%rax) +; AVX1-NEXT: retq +; +; AVX512-LABEL: load_i64_stride6_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX512-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm3[0] +; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vpbroadcastq 24(%rdi), %xmm3 +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm3, %xmm3 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm5 +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm5, %xmm5 +; AVX512-NEXT: vmovaps %xmm4, (%rsi) +; AVX512-NEXT: vmovaps %xmm0, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: vmovdqa %xmm3, (%r8) +; AVX512-NEXT: vmovaps %xmm2, (%r9) +; AVX512-NEXT: vmovdqa %xmm5, (%rax) +; AVX512-NEXT: retq %wide.vec = load <12 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <12 x i64> %wide.vec, <12 x i64> poison, <2 x i32> %strided.vec1 = shufflevector <12 x i64> %wide.vec, <12 x i64> poison, <2 x i32> @@ -84,48 +107,48 @@ ; SSE-LABEL: load_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps 80(%rdi), %xmm3 -; SSE-NEXT: movaps 176(%rdi), %xmm5 +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps 64(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdi), %xmm6 ; SSE-NEXT: movaps 160(%rdi), %xmm7 -; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm4 -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 112(%rdi), %xmm2 +; SSE-NEXT: movaps 64(%rdi), %xmm8 +; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps 16(%rdi), %xmm3 +; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm9 ; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: movaps 96(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movaps %xmm13, (%rsi) +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm9[1] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, 16(%rsi) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm13, (%rsi) ; SSE-NEXT: movaps %xmm11, 16(%rdx) -; SSE-NEXT: movaps %xmm10, (%rcx) -; SSE-NEXT: movaps %xmm9, 16(%rcx) -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm10, 16(%rcx) +; SSE-NEXT: movaps %xmm9, (%rcx) +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm3, (%r8) +; SSE-NEXT: movaps %xmm8, 16(%r9) ; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: movaps %xmm6, 16(%r9) -; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm1, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride6_vf4: @@ -273,20 +296,21 @@ ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u> ; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %xmm7 ; AVX512F-FAST-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> -; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <11,1,7,u> +; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,0,6] +; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpermi2q %ymm8, %ymm4, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] ; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -357,20 +381,21 @@ ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u> ; AVX512BW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4] -; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %xmm7 ; AVX512BW-FAST-NEXT: vpermi2q %ymm7, %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> -; AVX512BW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] -; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <11,1,7,u> +; AVX512BW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,0,6] +; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512BW-FAST-NEXT: vpermi2q %ymm8, %ymm4, %ymm7 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] ; AVX512BW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm8 -; AVX512BW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FAST-NEXT: vpbroadcastq 136(%rdi), %ymm7 +; AVX512BW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [5,11] ; AVX512BW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -402,21 +427,21 @@ ; SSE-LABEL: load_i64_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movaps 160(%rdi), %xmm8 +; SSE-NEXT: movaps 352(%rdi), %xmm8 ; SSE-NEXT: movaps 256(%rdi), %xmm9 ; SSE-NEXT: movaps 208(%rdi), %xmm0 -; SSE-NEXT: movaps 352(%rdi), %xmm12 -; SSE-NEXT: movaps 304(%rdi), %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm12 +; SSE-NEXT: movaps 112(%rdi), %xmm1 ; SSE-NEXT: movaps 64(%rdi), %xmm15 ; SSE-NEXT: movaps (%rdi), %xmm3 ; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps 144(%rdi), %xmm14 -; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps 336(%rdi), %xmm14 +; SSE-NEXT: movaps 288(%rdi), %xmm4 ; SSE-NEXT: movaps 240(%rdi), %xmm13 ; SSE-NEXT: movaps 192(%rdi), %xmm5 -; SSE-NEXT: movaps 336(%rdi), %xmm11 -; SSE-NEXT: movaps 288(%rdi), %xmm6 +; SSE-NEXT: movaps 144(%rdi), %xmm11 +; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] ; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill @@ -447,7 +472,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm7 +; SSE-NEXT: movaps 304(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm8[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] @@ -456,8 +481,8 @@ ; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps 368(%rdi), %xmm1 -; SSE-NEXT: movaps 320(%rdi), %xmm3 +; SSE-NEXT: movaps 176(%rdi), %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] @@ -466,44 +491,44 @@ ; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm1 -; SSE-NEXT: movaps 128(%rdi), %xmm0 +; SSE-NEXT: movaps 368(%rdi), %xmm1 +; SSE-NEXT: movaps 320(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm13, 16(%rsi) +; SSE-NEXT: movaps %xmm13, 48(%rsi) ; SSE-NEXT: movaps %xmm11, 32(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rsi) +; SSE-NEXT: movaps %xmm1, 16(%rsi) ; SSE-NEXT: movaps %xmm14, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rcx) +; SSE-NEXT: movaps %xmm9, 48(%rcx) ; SSE-NEXT: movaps %xmm12, 32(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) +; SSE-NEXT: movaps %xmm15, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps %xmm7, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movaps %xmm1, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps %xmm2, 48(%r9) ; SSE-NEXT: movaps %xmm5, 32(%r9) -; SSE-NEXT: movaps %xmm6, 48(%r9) +; SSE-NEXT: movaps %xmm6, 16(%r9) ; SSE-NEXT: movaps %xmm10, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps %xmm4, 32(%rax) -; SSE-NEXT: movaps %xmm3, 48(%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq @@ -869,35 +894,35 @@ ; SSE: # %bb.0: ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movaps (%rdi), %xmm7 -; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 576(%rdi), %xmm8 -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm9 -; SSE-NEXT: movaps 720(%rdi), %xmm2 -; SSE-NEXT: movaps 672(%rdi), %xmm10 -; SSE-NEXT: movaps 336(%rdi), %xmm3 -; SSE-NEXT: movaps 288(%rdi), %xmm11 -; SSE-NEXT: movaps 432(%rdi), %xmm4 -; SSE-NEXT: movaps 384(%rdi), %xmm13 -; SSE-NEXT: movaps 528(%rdi), %xmm5 +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps 672(%rdi), %xmm8 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps 624(%rdi), %xmm2 +; SSE-NEXT: movaps 576(%rdi), %xmm10 +; SSE-NEXT: movaps 240(%rdi), %xmm3 +; SSE-NEXT: movaps 192(%rdi), %xmm11 +; SSE-NEXT: movaps 528(%rdi), %xmm4 ; SSE-NEXT: movaps 480(%rdi), %xmm12 -; SSE-NEXT: movaps 144(%rdi), %xmm6 -; SSE-NEXT: movaps 96(%rdi), %xmm14 +; SSE-NEXT: movaps 144(%rdi), %xmm5 +; SSE-NEXT: movaps 96(%rdi), %xmm13 +; SSE-NEXT: movaps 432(%rdi), %xmm6 +; SSE-NEXT: movaps 384(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm6 +; SSE-NEXT: movaps %xmm13, %xmm6 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm4[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -949,7 +974,7 @@ ; SSE-NEXT: movaps 304(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm0 @@ -960,51 +985,51 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 544(%rdi), %xmm0 -; SSE-NEXT: movaps 496(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: movaps 496(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] ; SSE-NEXT: movaps 640(%rdi), %xmm0 -; SSE-NEXT: movaps 592(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 592(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 688(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps 688(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm0[1] ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 32(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdi), %xmm0 -; SSE-NEXT: movaps 128(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 272(%rdi), %xmm0 -; SSE-NEXT: movaps 224(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm15 +; SSE-NEXT: movaps 320(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 464(%rdi), %xmm0 ; SSE-NEXT: movaps 416(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] ; SSE-NEXT: movaps 560(%rdi), %xmm0 ; SSE-NEXT: movaps 512(%rdi), %xmm5 @@ -1022,56 +1047,56 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm12, 96(%r8) -; SSE-NEXT: movaps %xmm14, 80(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm14, 112(%r8) +; SSE-NEXT: movaps %xmm13, 96(%r8) +; SSE-NEXT: movaps %xmm10, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1085,22 +1110,22 @@ ; SSE-NEXT: movaps %xmm2, 112(%r9) ; SSE-NEXT: movaps %xmm4, 96(%r9) ; SSE-NEXT: movaps %xmm7, 80(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) +; SSE-NEXT: movaps %xmm8, 64(%r9) ; SSE-NEXT: movaps %xmm15, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm3, 96(%rax) ; SSE-NEXT: movaps %xmm5, 80(%rax) ; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movaps %xmm11, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $408, %rsp # imm = 0x198 @@ -1173,16 +1198,16 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm14[0],xmm13[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm7[0],ymm15[2],ymm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm6[0],ymm15[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm5[0] @@ -1203,65 +1228,65 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm7[1],ymm15[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 512(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm12[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 704(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm14[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm12[1],xmm8[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm12[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -1287,25 +1312,25 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) ; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1313,64 +1338,63 @@ ; AVX2-ONLY-LABEL: load_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $520, %rsp # imm = 0x208 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm15[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovaps %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm12[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm9[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm12[0],ymm5[0],ymm12[2],ymm5[2] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm10[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm13[0],ymm11[2],ymm13[2] +; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm3[0],ymm13[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm15[0],ymm3[0],ymm15[2],ymm3[2] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,0,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm11[1],ymm2[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm15[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm12[1] ; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovaps %ymm6, %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] @@ -1379,146 +1403,147 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm8[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm1[0],xmm11[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm4[1],ymm10[1],ymm4[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm5[1],ymm13[1],ymm5[3],ymm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm8[1],ymm14[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm11[1],ymm6[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm14, (%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm11[0],ymm1[2],ymm11[2] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm3[0],xmm4[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX2-ONLY-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1828,29 +1853,29 @@ ; SSE-LABEL: load_i64_stride6_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1176, %rsp # imm = 0x498 -; SSE-NEXT: movaps 624(%rdi), %xmm0 -; SSE-NEXT: movaps 576(%rdi), %xmm9 -; SSE-NEXT: movaps 240(%rdi), %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm8 -; SSE-NEXT: movaps 720(%rdi), %xmm2 -; SSE-NEXT: movaps 672(%rdi), %xmm11 -; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps 288(%rdi), %xmm10 -; SSE-NEXT: movaps 432(%rdi), %xmm5 -; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 912(%rdi), %xmm3 -; SSE-NEXT: movaps 528(%rdi), %xmm6 -; SSE-NEXT: movaps 480(%rdi), %xmm14 -; SSE-NEXT: movaps 144(%rdi), %xmm7 -; SSE-NEXT: movaps 96(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; SSE-NEXT: movaps 720(%rdi), %xmm0 +; SSE-NEXT: movaps 672(%rdi), %xmm9 +; SSE-NEXT: movaps 336(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rdi), %xmm8 +; SSE-NEXT: movaps 624(%rdi), %xmm2 +; SSE-NEXT: movaps 576(%rdi), %xmm11 +; SSE-NEXT: movaps 240(%rdi), %xmm4 +; SSE-NEXT: movaps 192(%rdi), %xmm10 +; SSE-NEXT: movaps 528(%rdi), %xmm5 +; SSE-NEXT: movaps 480(%rdi), %xmm13 +; SSE-NEXT: movaps 144(%rdi), %xmm6 +; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 816(%rdi), %xmm3 +; SSE-NEXT: movaps 432(%rdi), %xmm7 +; SSE-NEXT: movaps 384(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm7[1] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm4[0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm6[1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm4 @@ -1859,15 +1884,15 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm5[1] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1878,21 +1903,14 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 864(%rdi), %xmm0 +; SSE-NEXT: movaps 768(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 816(%rdi), %xmm0 -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1104(%rdi), %xmm0 -; SSE-NEXT: movaps 1056(%rdi), %xmm1 +; SSE-NEXT: movaps 912(%rdi), %xmm0 +; SSE-NEXT: movaps 864(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1905,8 +1923,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1296(%rdi), %xmm0 -; SSE-NEXT: movaps 1248(%rdi), %xmm1 +; SSE-NEXT: movaps 1104(%rdi), %xmm0 +; SSE-NEXT: movaps 1056(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1919,8 +1937,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1488(%rdi), %xmm0 -; SSE-NEXT: movaps 1440(%rdi), %xmm1 +; SSE-NEXT: movaps 1296(%rdi), %xmm0 +; SSE-NEXT: movaps 1248(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1933,7 +1951,14 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdi), %xmm1 +; SSE-NEXT: movaps 1488(%rdi), %xmm0 +; SSE-NEXT: movaps 1440(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] @@ -2150,14 +2175,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -2166,13 +2183,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2182,38 +2199,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -2333,85 +2358,85 @@ ; AVX1-ONLY-LABEL: load_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2450,7 +2475,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] @@ -2465,128 +2490,122 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm11[0],ymm14[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm13[0],ymm3[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm10[1],ymm12[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm13[1],ymm3[3],ymm13[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm11[1],ymm14[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm8[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2597,6 +2616,12 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -2725,14 +2750,6 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) @@ -2741,13 +2758,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2757,13 +2774,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2773,22 +2790,30 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) @@ -2822,261 +2847,250 @@ ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm0[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm11[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm7[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm6[1],mem[1] +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm5[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm3[1],mem[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm9[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm5[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm11[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm3[0],ymm14[0],ymm3[2],ymm14[2] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 208(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm15[0],xmm14[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm13[0],xmm12[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm9[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm7[0],ymm10[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm7[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm5[1],ymm1[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3084,25 +3098,41 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm5 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3112,58 +3142,56 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm11[0] +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm1 @@ -3171,56 +3199,47 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 136(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 328(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 520(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm13[1],xmm14[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 712(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm10[1],xmm11[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 904(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1096(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 1288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 1480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3230,13 +3249,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3246,13 +3265,13 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -3262,22 +3281,30 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%r9) @@ -3296,710 +3323,723 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX2-ONLY-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512F-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <0,6,12,u> +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm12, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,0,0,6,12,0,0,6] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm12, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm12, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <10,0,6,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <11,1,7,u> +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm17 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm16 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,7,13,0,1,7,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm5, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm22 +; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm4, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm4, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm11, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm19 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm25 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm5, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512F-NEXT: vinserti32x4 $0, %xmm16, %zmm7, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm6, %zmm8, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 {%k1} +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm8, (%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride6_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-NEXT: subq $2696, %rsp # imm = 0xA88 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,0,10,0,6,0,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,6,12,u> +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm12, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,7,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [10,4,10,4,10,4,10,4] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,0,0,6,12,0,0,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [12,0,0,6,12,0,0,6] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm3, %zmm12, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm3, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <10,0,6,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <10,0,6,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <11,1,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <11,1,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,11,5,11,5,11,5,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm16, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm17 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm16 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm19 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm16 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,6,12,0,0,6,12] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm4, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,7,13,0,1,7,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm7, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm11, %zmm4, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm11, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm15, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm23, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512BW-NEXT: vinserti32x4 $0, %xmm16, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 {%k1} +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm8, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <192 x i64>, ptr %in.vec, align 64 @@ -5054,182 +5094,182 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3768, %rsp # imm = 0xEB8 -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX1-ONLY-NEXT: subq $3736, %rsp # imm = 0xE98 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1824(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2544(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2976(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1008(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1440(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1632(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1824(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1776(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2016(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 2544(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2784(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2976(%rdi), %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5347,123 +5387,123 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 736(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1120(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2464(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1888(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5479,164 +5519,164 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2272(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2464(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 928(%rdi), %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 544(%rdi), %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 @@ -5657,7 +5697,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -5829,7 +5869,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5880,7 +5920,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -5927,22 +5967,6 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rsi) @@ -5959,21 +5983,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5991,38 +6015,54 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 416(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%r8) @@ -6098,7 +6138,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) @@ -6114,213 +6154,213 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $3768, %rsp # imm = 0xEB8 +; AVX1-ONLY-NEXT: addq $3736, %rsp # imm = 0xE98 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride6_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm2 +; AVX2-ONLY-NEXT: subq $3320, %rsp # imm = 0xCF8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm0[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2736(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1008(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1200(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1584(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 1776(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1968(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 2160(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 2544(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm6[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6328,7 +6368,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6336,7 +6376,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6344,7 +6384,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6352,7 +6392,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1832(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6360,7 +6400,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2216(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6368,7 +6408,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2600(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2408(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6376,7 +6416,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2984(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 2792(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] @@ -6384,53 +6424,61 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 104(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd 296(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vbroadcastsd 680(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm11[1],mem[1] -; AVX2-ONLY-NEXT: vbroadcastsd 872(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vbroadcastsd 1064(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm14[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vbroadcastsd 1448(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 1640(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastsd 1832(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2024(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 2216(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 2600(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm8[1],ymm3[3],ymm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 2792(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 2984(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm10[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] @@ -6441,320 +6489,306 @@ ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1888(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 976(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1744(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2272(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1360(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2656(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 1888(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1744(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2848(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 2080(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2464(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 2272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2320(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovaps 2128(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 2080(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 2464(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1936(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 2320(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 2656(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 1552(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1312(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 1168(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm9[0] +; AVX2-ONLY-NEXT: vbroadcastsd 2848(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm4[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 928(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm6[0] +; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2] +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 2896(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm8[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm3[0],xmm15[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm14[0],xmm0[0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm13[1],ymm2[3],ymm13[3] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm6[1],ymm15[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm15[1] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm10[1],ymm15[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm5[1],ymm10[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm12[1] -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm1[1],ymm15[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] -; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm13[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm1[1],ymm13[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm11[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm15[1],ymm11[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm9[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm1[1],ymm9[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm7[1],mem[1] +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm1[1],ymm7[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm8[1] ; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm2[1],ymm12[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm12[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6764,7 +6798,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6774,7 +6809,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6784,7 +6820,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6794,7 +6831,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6804,7 +6842,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6814,7 +6853,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6824,36 +6864,39 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm14[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm1[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 2000(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm14[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 2192(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm12[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] -; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 2384(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm10[0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6866,8 +6909,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-ONLY-NEXT: vmovaps 2768(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %xmm3 @@ -6886,7 +6928,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6956,54 +6998,38 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 1864(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2056(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm14[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 2440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 2632(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 2824(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 3016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7021,21 +7047,21 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7053,38 +7079,54 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -7149,10 +7191,10 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm2, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 352(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 288(%rax) @@ -7173,20 +7215,20 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 +; AVX2-ONLY-NEXT: addq $3320, %rsp # imm = 0xCF8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm15 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] @@ -7194,154 +7236,154 @@ ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [10,4,10,4,10,4,10,4] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm10, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] ; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm1 ; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 @@ -7358,17 +7400,17 @@ ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,10,4,10,4,10,4,10] +; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7382,12 +7424,12 @@ ; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7401,316 +7443,320 @@ ; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm31 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm29 ; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm28 +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm27 +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm20 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm17 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 ; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,10,0,6,0,10,0,6] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 ; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm19 +; AVX512F-NEXT: vpermi2q %zmm22, %zmm12, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm22, %zmm12, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm12, %zmm22, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7718,54 +7764,50 @@ ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7796,70 +7838,75 @@ ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 16-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm31, %zmm11, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm29, %zmm12, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm28, %zmm21, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm27, %zmm20, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512F-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512F-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 320(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 384(%rsi) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, 320(%rsi) +; AVX512F-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 256(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 192(%rsi) @@ -7871,63 +7918,65 @@ ; AVX512F-NEXT: vmovaps %zmm13, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rdx) +; AVX512F-NEXT: vmovaps %zmm13, 384(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 320(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rdx) +; AVX512F-NEXT: vmovaps %zmm13, 256(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 192(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rdx) +; AVX512F-NEXT: vmovaps %zmm13, 128(%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 384(%rdx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 256(%rcx) +; AVX512F-NEXT: vmovaps %zmm13, 384(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 320(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%rcx) +; AVX512F-NEXT: vmovaps %zmm13, 256(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 192(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%rcx) +; AVX512F-NEXT: vmovaps %zmm13, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm31, 384(%rcx) +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm13, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm30, 384(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 320(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 128(%r8) +; AVX512F-NEXT: vmovaps %zmm13, 256(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 192(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, (%r8) +; AVX512F-NEXT: vmovaps %zmm13, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm13, 64(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-NEXT: vmovaps %zmm13, (%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7935,13 +7984,13 @@ ; AVX512BW-LABEL: load_i64_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $7240, %rsp # imm = 0x1C48 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,0,10,0,6,0,10] @@ -7949,154 +7998,154 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,7,0,11,1,7,0,11] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,0,11,1,7,0,11] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [10,4,10,4,10,4,10,4] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,5,11,5,11,5,11,5] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,5,11,5,11,5,11,5] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,0,0,6,12,0,0,6] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,0,1,7,13,0,1,7] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm8 +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm10 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm11 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,6,12,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm1 @@ -8113,17 +8162,17 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [4,10,4,10,4,10,4,10] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [4,10,4,10,4,10,4,10] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,11,5,11,5,11,5,11] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8137,12 +8186,12 @@ ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8156,316 +8205,320 @@ ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm15, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm31 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm28 +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm16, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm28 -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm17 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm30 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm15, %zmm31 ; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm17, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm17, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,6,12,0,0,6,12] ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,7,13,0,1,7,13] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,10,0,6,0,10,0,6] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,10,0,6,0,10,0,6] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,11,1,7,0,11,1,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm18, %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm18, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm19 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm12, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm12, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm12, %zmm22, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8473,54 +8526,50 @@ ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8551,70 +8600,75 @@ ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 16-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm3 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 16-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm11 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm31, %zmm11, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm11 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm29, %zmm12, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm28, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm20, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm18, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 384(%rsi) -; AVX512BW-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 320(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm13, 384(%rsi) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm13, 320(%rsi) +; AVX512BW-NEXT: vmovups (%rsp), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 256(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 192(%rsi) @@ -8626,63 +8680,65 @@ ; AVX512BW-NEXT: vmovaps %zmm13, (%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rdx) +; AVX512BW-NEXT: vmovaps %zmm13, 384(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 320(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rdx) +; AVX512BW-NEXT: vmovaps %zmm13, 256(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 192(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rdx) +; AVX512BW-NEXT: vmovaps %zmm13, 128(%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 384(%rdx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm13, (%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 256(%rcx) +; AVX512BW-NEXT: vmovaps %zmm13, 384(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 320(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%rcx) +; AVX512BW-NEXT: vmovaps %zmm13, 256(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%rcx) +; AVX512BW-NEXT: vmovaps %zmm13, 128(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm13, (%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 384(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 320(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 128(%r8) +; AVX512BW-NEXT: vmovaps %zmm13, 256(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 192(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, (%r8) +; AVX512BW-NEXT: vmovaps %zmm13, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm13, 64(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm13, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-NEXT: vmovaps %zmm13, (%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: addq $7240, %rsp # imm = 0x1C48 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -8702,12 +8758,11 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX1: {{.*}} +; AVX: {{.*}} ; AVX2: {{.*}} ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -101,27 +101,26 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3] -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3] -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3] -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3] -; AVX512-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512-NEXT: vmovdqa %xmm0, (%rdx) -; AVX512-NEXT: vmovdqa %xmm5, (%rcx) -; AVX512-NEXT: vmovdqa %xmm1, (%r8) -; AVX512-NEXT: vmovdqa %xmm6, (%r9) -; AVX512-NEXT: vmovdqa %xmm2, (%r10) -; AVX512-NEXT: vmovdqa %xmm3, (%rax) +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] +; AVX512-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm1, %xmm1 +; AVX512-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512-NEXT: vpunpcklqdq 104(%rdi){1to2}, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa %xmm3, (%rsi) +; AVX512-NEXT: vmovaps %xmm4, (%rdx) +; AVX512-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512-NEXT: vmovaps %xmm5, (%r8) +; AVX512-NEXT: vmovdqa %xmm1, (%r9) +; AVX512-NEXT: vmovaps %xmm6, (%r10) +; AVX512-NEXT: vmovdqa %xmm2, (%rax) ; AVX512-NEXT: retq %wide.vec = load <14 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> @@ -211,17 +210,16 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[3],ymm4[2] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] @@ -261,17 +259,18 @@ ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] @@ -315,23 +314,23 @@ ; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -342,8 +341,8 @@ ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm6, (%r8) -; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%r10) +; AVX512F-SLOW-NEXT: vmovdqa %ymm8, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, (%r10) ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -365,27 +364,28 @@ ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512F-FAST-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-FAST-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512F-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -395,9 +395,9 @@ ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%r8) ; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%r10) +; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%r10) ; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -423,23 +423,23 @@ ; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512BW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -450,8 +450,8 @@ ; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm2, (%rcx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm6, (%r8) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm7, (%r9) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm8, (%r10) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm8, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm7, (%r10) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq @@ -473,27 +473,28 @@ ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] ; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %xmm6 ; AVX512BW-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 -; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm8 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] ; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] ; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 @@ -503,9 +504,9 @@ ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rcx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512BW-FAST-NEXT: vmovdqa %ymm6, (%r8) ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm6, (%r10) +; AVX512BW-FAST-NEXT: vmovdqa %ymm7, (%r10) ; AVX512BW-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -531,55 +532,55 @@ ; SSE-LABEL: load_i64_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: subq $88, %rsp -; SSE-NEXT: movapd 320(%rdi), %xmm0 -; SSE-NEXT: movapd 208(%rdi), %xmm1 -; SSE-NEXT: movapd 256(%rdi), %xmm2 -; SSE-NEXT: movapd 144(%rdi), %xmm3 -; SSE-NEXT: movapd 304(%rdi), %xmm4 -; SSE-NEXT: movapd 192(%rdi), %xmm5 -; SSE-NEXT: movapd 240(%rdi), %xmm6 -; SSE-NEXT: movapd 128(%rdi), %xmm7 -; SSE-NEXT: movapd 288(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 -; SSE-NEXT: movapd 336(%rdi), %xmm10 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 272(%rdi), %xmm14 +; SSE-NEXT: movapd 208(%rdi), %xmm0 +; SSE-NEXT: movapd 96(%rdi), %xmm1 +; SSE-NEXT: movapd 144(%rdi), %xmm2 +; SSE-NEXT: movapd 192(%rdi), %xmm3 +; SSE-NEXT: movapd 80(%rdi), %xmm4 +; SSE-NEXT: movapd 128(%rdi), %xmm5 +; SSE-NEXT: movapd 176(%rdi), %xmm7 +; SSE-NEXT: movapd 64(%rdi), %xmm8 +; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: movapd 16(%rdi), %xmm9 +; SSE-NEXT: movapd 32(%rdi), %xmm6 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 336(%rdi), %xmm11 ; SSE-NEXT: movapd 112(%rdi), %xmm13 ; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm15, %xmm12 -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] +; SSE-NEXT: movapd %xmm14, %xmm12 +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm10[0],xmm12[1] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm9[0] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm4[0] ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm1[0] +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm15, %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm13[0],xmm14[1] +; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] +; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0] -; SSE-NEXT: movapd %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm15[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm14, %xmm15 -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] -; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm8[0] -; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0] -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 384(%rdi), %xmm5 -; SSE-NEXT: movapd %xmm5, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm10[0],xmm14[1] +; SSE-NEXT: movapd %xmm5, %xmm15 +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm11[0],xmm15[1] ; SSE-NEXT: movapd 400(%rdi), %xmm7 -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm7[0] +; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 352(%rdi), %xmm8 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1] ; SSE-NEXT: movapd 416(%rdi), %xmm9 @@ -589,167 +590,161 @@ ; SSE-NEXT: movapd 432(%rdi), %xmm11 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm11[0] ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] -; SSE-NEXT: movapd (%rdi), %xmm5 -; SSE-NEXT: movapd 48(%rdi), %xmm12 +; SSE-NEXT: movapd 224(%rdi), %xmm5 +; SSE-NEXT: movapd 272(%rdi), %xmm12 ; SSE-NEXT: movapd %xmm12, %xmm6 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm5[0],xmm6[1] -; SSE-NEXT: movapd 64(%rdi), %xmm0 +; SSE-NEXT: movapd 288(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0] -; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd 240(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 80(%rdi), %xmm2 +; SSE-NEXT: movapd 304(%rdi), %xmm2 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] -; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 256(%rdi), %xmm3 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; SSE-NEXT: movapd 96(%rdi), %xmm4 +; SSE-NEXT: movapd 320(%rdi), %xmm4 ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm4[0] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm12[0],xmm4[1] -; SSE-NEXT: movapd %xmm6, (%rsi) -; SSE-NEXT: movapd %xmm14, 48(%rsi) -; SSE-NEXT: movapd %xmm15, 32(%rsi) +; SSE-NEXT: movapd %xmm6, 32(%rsi) +; SSE-NEXT: movapd %xmm15, 48(%rsi) +; SSE-NEXT: movapd %xmm14, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) -; SSE-NEXT: movapd %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm6, (%rsi) +; SSE-NEXT: movapd %xmm5, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 16(%rdx) -; SSE-NEXT: movapd %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movapd %xmm0, 32(%rcx) ; SSE-NEXT: movapd %xmm7, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) -; SSE-NEXT: movapd %xmm1, (%r8) -; SSE-NEXT: movapd %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movapd %xmm1, 32(%r8) +; SSE-NEXT: movapd %xmm8, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movapd %xmm2, (%r9) -; SSE-NEXT: movapd %xmm9, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movapd %xmm2, 32(%r9) +; SSE-NEXT: movapd %xmm9, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: movapd %xmm3, 32(%rax) ; SSE-NEXT: movapd %xmm10, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) ; SSE-NEXT: movapd %xmm11, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $88, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm13[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[3],ymm9[2] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[3],ymm8[2] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2,3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm9[0],ymm11[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[3],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[1],ymm11[0],ymm12[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1],ymm14[0],ymm13[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = mem[0],xmm14[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm6[0],ymm15[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm12, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -762,9 +757,8 @@ ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3] @@ -773,68 +767,70 @@ ; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm13[1],ymm11[1],ymm13[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm13, %ymm14 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm14[1],ymm9[1],ymm14[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm14, %ymm15 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = ymm14[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm2 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -846,14 +842,14 @@ ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1106,8 +1102,8 @@ ; SSE-NEXT: movapd 192(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm4 ; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 +; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm10 ; SSE-NEXT: movapd 16(%rdi), %xmm7 ; SSE-NEXT: movapd 32(%rdi), %xmm6 @@ -1118,10 +1114,10 @@ ; SSE-NEXT: movapd %xmm14, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] @@ -1133,10 +1129,10 @@ ; SSE-NEXT: movapd %xmm15, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] @@ -1170,7 +1166,7 @@ ; SSE-NEXT: movapd 384(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 400(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1227,19 +1223,19 @@ ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 656(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 672(%rdi), %xmm2 +; SSE-NEXT: movapd 672(%rdi), %xmm3 ; SSE-NEXT: movapd 720(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm8 -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm2[0],xmm8[1] +; SSE-NEXT: movapd %xmm4, %xmm9 +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm3[0],xmm9[1] ; SSE-NEXT: movapd 736(%rdi), %xmm6 -; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm6[0] -; SSE-NEXT: movapd 688(%rdi), %xmm9 -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm6[0] +; SSE-NEXT: movapd 688(%rdi), %xmm8 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] ; SSE-NEXT: movapd 752(%rdi), %xmm12 -; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm12[0] ; SSE-NEXT: movapd 704(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1] ; SSE-NEXT: movapd 768(%rdi), %xmm0 @@ -1254,55 +1250,55 @@ ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm0[0] ; SSE-NEXT: movapd 800(%rdi), %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE-NEXT: movapd 864(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] +; SSE-NEXT: movapd 864(%rdi), %xmm2 +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-NEXT: movapd 816(%rdi), %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm7[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm7[0],xmm2[1] ; SSE-NEXT: movapd 880(%rdi), %xmm10 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm11[0],xmm10[1] -; SSE-NEXT: movapd %xmm8, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, 32(%rsi) ; SSE-NEXT: movapd %xmm5, 112(%rsi) -; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, 64(%rsi) +; SSE-NEXT: movaps %xmm5, 48(%rsi) +; SSE-NEXT: movapd %xmm9, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, (%rsi) +; SSE-NEXT: movaps %xmm5, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, 16(%rsi) -; SSE-NEXT: movapd %xmm2, 96(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, (%rsi) ; SSE-NEXT: movapd %xmm4, 112(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 48(%rdx) +; SSE-NEXT: movapd %xmm3, 96(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rdx) ; SSE-NEXT: movapd %xmm13, 80(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) -; SSE-NEXT: movapd %xmm6, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 64(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movapd %xmm0, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movapd %xmm6, 96(%rcx) ; SSE-NEXT: movapd %xmm14, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movapd %xmm1, 112(%r8) -; SSE-NEXT: movapd %xmm9, 96(%r8) +; SSE-NEXT: movapd %xmm8, 96(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1315,7 +1311,7 @@ ; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm2, 112(%r9) ; SSE-NEXT: movapd %xmm12, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%r9) @@ -1332,7 +1328,7 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm7, 112(%rax) ; SSE-NEXT: movapd %xmm15, 96(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) @@ -1365,466 +1361,452 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm15[0],ymm5[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[1],ymm11[0],ymm9[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm2[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm15[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm11[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm3[0],ymm10[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm2[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[3],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm4[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm15 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) -; AVX1-ONLY-NEXT: vmovapd %ymm11, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm12, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 32(%rax) -; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 +; AVX1-ONLY-NEXT: vmovapd %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, (%rax) +; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm4, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm2, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm0[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm2[1],ymm14[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = mem[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],ymm13[1],ymm11[3],ymm13[3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm3[1],ymm12[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%r8) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm11, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 32(%rax) -; AVX2-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2220,15 +2202,15 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { ; SSE-LABEL: load_i64_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $1448, %rsp # imm = 0x5A8 +; SSE-NEXT: subq $1432, %rsp # imm = 0x598 ; SSE-NEXT: movapd 208(%rdi), %xmm0 ; SSE-NEXT: movapd 96(%rdi), %xmm1 ; SSE-NEXT: movapd 144(%rdi), %xmm2 ; SSE-NEXT: movapd 192(%rdi), %xmm3 ; SSE-NEXT: movapd 80(%rdi), %xmm4 ; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 64(%rdi), %xmm8 -; SSE-NEXT: movapd 176(%rdi), %xmm9 +; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm9 ; SSE-NEXT: movapd (%rdi), %xmm10 ; SSE-NEXT: movapd 16(%rdi), %xmm7 ; SSE-NEXT: movapd 32(%rdi), %xmm6 @@ -2239,10 +2221,10 @@ ; SSE-NEXT: movapd %xmm14, %xmm13 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm7[0],xmm8[1] -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] +; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm4[0] ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1] @@ -2254,10 +2236,10 @@ ; SSE-NEXT: movapd %xmm15, %xmm1 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm9[0] +; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0] ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] -; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm3[0] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] @@ -2485,13 +2467,13 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1344(%rdi), %xmm13 +; SSE-NEXT: movapd 1344(%rdi), %xmm14 ; SSE-NEXT: movapd 1392(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1408(%rdi), %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0] +; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0] ; SSE-NEXT: movapd 1360(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2508,8 +2490,8 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1456(%rdi), %xmm9 ; SSE-NEXT: movapd 1504(%rdi), %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm14 -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm9[0],xmm14[1] +; SSE-NEXT: movapd %xmm1, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1] ; SSE-NEXT: movapd 1520(%rdi), %xmm12 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm12[0] ; SSE-NEXT: movapd 1472(%rdi), %xmm2 @@ -2519,18 +2501,18 @@ ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1488(%rdi), %xmm2 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: movapd 1552(%rdi), %xmm0 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1568(%rdi), %xmm5 +; SSE-NEXT: movapd 1568(%rdi), %xmm3 ; SSE-NEXT: movapd 1616(%rdi), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm7 -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1] ; SSE-NEXT: movapd 1632(%rdi), %xmm8 -; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm8[0] ; SSE-NEXT: movapd 1584(%rdi), %xmm11 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1] ; SSE-NEXT: movapd 1648(%rdi), %xmm1 @@ -2540,83 +2522,82 @@ ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd 1664(%rdi), %xmm1 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 1680(%rdi), %xmm1 -; SSE-NEXT: movapd 1728(%rdi), %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE-NEXT: movapd 1744(%rdi), %xmm3 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm3[0] +; SSE-NEXT: movapd 1680(%rdi), %xmm0 +; SSE-NEXT: movapd 1728(%rdi), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm2 +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd 1744(%rdi), %xmm1 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] ; SSE-NEXT: movapd 1696(%rdi), %xmm6 -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1] ; SSE-NEXT: movapd 1760(%rdi), %xmm10 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm10[0] ; SSE-NEXT: movapd 1712(%rdi), %xmm15 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1] -; SSE-NEXT: movapd 1776(%rdi), %xmm0 -; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm7, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movapd 1776(%rdi), %xmm4 +; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] ; SSE-NEXT: movapd %xmm2, 240(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 176(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 176(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movapd %xmm7, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps %xmm2, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps %xmm2, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movapd %xmm14, 208(%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) +; SSE-NEXT: movapd %xmm13, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 144(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movapd %xmm5, 224(%rdx) -; SSE-NEXT: movapd %xmm1, 240(%rdx) -; SSE-NEXT: movapd %xmm13, 192(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movapd %xmm0, 240(%rdx) +; SSE-NEXT: movapd %xmm3, 224(%rdx) ; SSE-NEXT: movapd %xmm9, 208(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movapd %xmm14, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movapd %xmm3, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movapd %xmm1, 240(%rcx) ; SSE-NEXT: movapd %xmm8, 224(%rcx) ; SSE-NEXT: movapd %xmm12, 208(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2678,7 +2659,7 @@ ; SSE-NEXT: movapd %xmm10, 240(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%r9) @@ -2708,7 +2689,7 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 240(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) @@ -2739,8 +2720,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movapd %xmm4, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2771,507 +2751,452 @@ ; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addq $1448, %rsp # imm = 0x5A8 +; SSE-NEXT: addq $1432, %rsp # imm = 0x598 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1720, %rsp # imm = 0x6B8 -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 896(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1344(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm7[0],xmm0[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm0[0],ymm10[3],ymm0[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm3[0],ymm8[3],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: subq $1576, %rsp # imm = 0x628 +; AVX1-ONLY-NEXT: vmovapd 1440(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 992(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 944(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1392(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm10[0],ymm15[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm7[0],ymm4[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm4[0],ymm11[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm2[0],ymm13[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm6[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovdqa 912(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[1],ymm8[0],ymm12[2],ymm8[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[1],ymm5[0],ymm11[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[1],ymm1[0],ymm7[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[1],ymm1[0],ymm13[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[1],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm1[0],ymm10[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm1[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm0[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm8[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm2[0],ymm6[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm2[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm12[0],xmm8[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[1],ymm3[0],ymm9[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm1[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm13[0],xmm5[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm11[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[1],ymm12[0],ymm8[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm8[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm13[0],ymm4[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm6[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm0[0],xmm9[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm10[0],ymm9[0],ymm10[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[3],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm8[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0],ymm4[0],ymm2[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm11[0],ymm7[0],ymm11[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[0],ymm6[0],ymm10[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm4[0],ymm9[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm3[0],ymm5[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm2[0],ymm15[3],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm8[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm0[0],ymm8[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = xmm8[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r9) +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r8) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm9, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovapd %ymm12, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) @@ -3281,618 +3206,605 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, (%rax) -; AVX1-ONLY-NEXT: addq $1720, %rsp # imm = 0x6B8 +; AVX1-ONLY-NEXT: vmovapd %ymm8, (%rax) +; AVX1-ONLY-NEXT: addq $1576, %rsp # imm = 0x628 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1624, %rsp # imm = 0x658 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 +; AVX2-ONLY-NEXT: subq $1448, %rsp # imm = 0x5A8 +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = mem[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm8[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm14[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm3, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm2, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm1, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 912(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm7, %ymm9 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm9, %ymm6 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm5, %ymm6 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm14[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm15 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm15[1],ymm2[3],ymm15[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r9) +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 224(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm15, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $1624, %rsp # imm = 0x658 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-ONLY-NEXT: addq $1448, %rsp # imm = 0x5A8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm21, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm30, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm30, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm18, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -3906,320 +3818,322 @@ ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] ; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,7,14,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm24, %zmm17 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm2[0,1,4,5],zmm15[4,5,0,1] ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [4,11,4,11] +; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm10[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[0,1,4,5],zmm11[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm4[0,1,4,5],zmm13[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm26, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm10, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <9,0,7,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [6,13] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm25, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm27, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 192(%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 128(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -4227,112 +4141,112 @@ ; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] ; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm30, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -4346,320 +4260,322 @@ ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,7,14,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm24, %zmm17 ; AVX512F-ONLY-FAST-NEXT: movb $24, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm2[0,1,4,5],zmm15[4,5,0,1] ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [4,11,4,11] +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm10[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm28, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[0,1,4,5],zmm11[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm4[0,1,4,5],zmm13[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm26, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <9,0,7,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [6,13] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 ; AVX512F-ONLY-FAST-NEXT: movb $-32, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm27, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 192(%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 128(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -4667,112 +4583,112 @@ ; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] -; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm21, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm25, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm30, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm30, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm0, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm18, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -4786,320 +4702,322 @@ ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm25 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm30 ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] ; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,7,14,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm24, %zmm17 ; AVX512DQ-SLOW-NEXT: movb $24, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm2[0,1,4,5],zmm15[4,5,0,1] ; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [4,11,4,11] +; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm24, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm10[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm28, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[0,1,4,5],zmm11[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm4[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm26, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <9,0,7,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [6,13] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm25, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm18, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 ; AVX512DQ-SLOW-NEXT: movb $-32, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm27, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 192(%rdx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 128(%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm1, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, (%rax) ; AVX512DQ-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -5107,112 +5025,112 @@ ; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm13 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] ; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm21, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm21, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512DQ-FAST-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm30, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm30, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -5226,2111 +5144,808 @@ ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm25 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm30 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] ; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,7,14,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm24, %zmm17 ; AVX512DQ-FAST-NEXT: movb $24, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm2[0,1,4,5],zmm15[4,5,0,1] ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [4,11,4,11] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm10[4,5,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm28, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm24, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[0,1,4,5],zmm11[4,5,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm4[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm10, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <9,0,7,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [6,13] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm18, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm11 ; AVX512DQ-FAST-NEXT: movb $-32, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm13 ; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm16, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm4 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, (%rsp), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm16, %zmm15 ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm27, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 128(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm15, (%rax) ; AVX512DQ-FAST-NEXT: addq $2152, %rsp # imm = 0x868 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; -; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: movb $24, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2120, %rsp # imm = 0x848 -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: movb $24, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 -; AVX512DQBW-SLOW-NEXT: movb $-32, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq -; -; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: movb $24, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] -; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: movb $-32, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, (%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $2120, %rsp # imm = 0x848 -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq - %wide.vec = load <224 x i64>, ptr %in.vec, align 64 - %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec2 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec3 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec4 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec5 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - %strided.vec6 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> - store <32 x i64> %strided.vec0, ptr %out.vec0, align 64 - store <32 x i64> %strided.vec1, ptr %out.vec1, align 64 - store <32 x i64> %strided.vec2, ptr %out.vec2, align 64 - store <32 x i64> %strided.vec3, ptr %out.vec3, align 64 - store <32 x i64> %strided.vec4, ptr %out.vec4, align 64 - store <32 x i64> %strided.vec5, ptr %out.vec5, align 64 - store <32 x i64> %strided.vec6, ptr %out.vec6, align 64 - ret void -} - -define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { -; SSE-LABEL: load_i64_stride7_vf64: -; SSE: # %bb.0: -; SSE-NEXT: subq $3240, %rsp # imm = 0xCA8 -; SSE-NEXT: movapd 208(%rdi), %xmm0 -; SSE-NEXT: movapd 96(%rdi), %xmm1 -; SSE-NEXT: movapd 144(%rdi), %xmm2 -; SSE-NEXT: movapd 192(%rdi), %xmm3 -; SSE-NEXT: movapd 80(%rdi), %xmm4 -; SSE-NEXT: movapd 128(%rdi), %xmm5 -; SSE-NEXT: movapd 176(%rdi), %xmm8 -; SSE-NEXT: movapd 64(%rdi), %xmm9 -; SSE-NEXT: movapd (%rdi), %xmm10 -; SSE-NEXT: movapd 16(%rdi), %xmm7 -; SSE-NEXT: movapd 32(%rdi), %xmm6 -; SSE-NEXT: movapd 48(%rdi), %xmm14 -; SSE-NEXT: movapd 224(%rdi), %xmm11 -; SSE-NEXT: movapd 112(%rdi), %xmm12 -; SSE-NEXT: movapd 160(%rdi), %xmm15 -; SSE-NEXT: movapd %xmm14, %xmm13 -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] -; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-LABEL: load_i64_stride7_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm6 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm13, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm8, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm25, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [9,0,7,0,9,0,7,0] +; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm30, %zmm17 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm30, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm0, %zmm10 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm19, %zmm0, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <0,7,14,u> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm29, %zmm19 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm2[0,1,4,5],zmm15[4,5,0,1] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm28 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm23 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,11,4,11] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm29, %zmm17 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm1[0,1,4,5],zmm16[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm10 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm1[0,1,4,5],zmm13[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm1[0,1,4,5],zmm26[4,5,0,1] +; AVX512BW-NEXT: vpermi2q %zmm26, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm26, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm28, %zmm28 +; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm12, %zmm11, %zmm26 +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm24 +; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm0, %zmm16 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm15 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm12 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm12 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm2, %zmm25 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm13 +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: movb $-32, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm23, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm7 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm21, %zmm7 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm20 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm20 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm15, %zmm11 +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm15 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm12, %zmm15, %zmm12 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm15, %zmm14 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm15, %zmm9 +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512BW-NEXT: vmovaps %zmm12, (%rax) +; AVX512BW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %wide.vec = load <224 x i64>, ptr %in.vec, align 64 + %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec2 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec3 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec4 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec5 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + %strided.vec6 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> + store <32 x i64> %strided.vec0, ptr %out.vec0, align 64 + store <32 x i64> %strided.vec1, ptr %out.vec1, align 64 + store <32 x i64> %strided.vec2, ptr %out.vec2, align 64 + store <32 x i64> %strided.vec3, ptr %out.vec3, align 64 + store <32 x i64> %strided.vec4, ptr %out.vec4, align 64 + store <32 x i64> %strided.vec5, ptr %out.vec5, align 64 + store <32 x i64> %strided.vec6, ptr %out.vec6, align 64 + ret void +} + +define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind { +; SSE-LABEL: load_i64_stride7_vf64: +; SSE: # %bb.0: +; SSE-NEXT: subq $3240, %rsp # imm = 0xCA8 +; SSE-NEXT: movapd 208(%rdi), %xmm0 +; SSE-NEXT: movapd 96(%rdi), %xmm1 +; SSE-NEXT: movapd 144(%rdi), %xmm2 +; SSE-NEXT: movapd 192(%rdi), %xmm3 +; SSE-NEXT: movapd 80(%rdi), %xmm4 +; SSE-NEXT: movapd 128(%rdi), %xmm5 +; SSE-NEXT: movapd 176(%rdi), %xmm8 +; SSE-NEXT: movapd 64(%rdi), %xmm9 +; SSE-NEXT: movapd (%rdi), %xmm10 +; SSE-NEXT: movapd 16(%rdi), %xmm7 +; SSE-NEXT: movapd 32(%rdi), %xmm6 +; SSE-NEXT: movapd 48(%rdi), %xmm14 +; SSE-NEXT: movapd 224(%rdi), %xmm11 +; SSE-NEXT: movapd 112(%rdi), %xmm12 +; SSE-NEXT: movapd 160(%rdi), %xmm15 +; SSE-NEXT: movapd %xmm14, %xmm13 +; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm10[0],xmm13[1] +; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm9[0] ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1] @@ -8445,3065 +7060,2828 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4264, %rsp # imm = 0x10A8 -; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: subq $3800, %rsp # imm = 0xED8 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 544(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1120(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1568(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3360(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1888(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2336(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1840(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2336(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 2288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2736(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2784(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2688(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovapd 2736(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm2[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3232(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 3136(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm2[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm13[0],ymm5[0],ymm13[3],ymm5[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm4[0],ymm10[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm8[0],ymm15[3],ymm8[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm10[0],ymm0[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm0[0],ymm13[0],ymm0[3],ymm13[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 768(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 720(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1216(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1168(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1664(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2112(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2064(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2560(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3008(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 2960(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3456(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm15[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd 3408(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm3[0],ymm10[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 608(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm11[0],ymm9[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 512(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1056(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 960(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 1504(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm14[0],ymm0[3],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm0[0],ymm11[0],ymm0[3],ymm11[2] -; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm9[0],ymm0[0],ymm9[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1952(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[0],ymm7[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[0],ymm15[0],ymm6[3],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 2400(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2848(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 3296(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[3],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovdqa 2704(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovdqa 2256(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 832(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm13[0],ymm8[0],ymm13[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 736(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1280(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1728(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm12[0],ymm14[0],ymm12[3],ymm14[2] +; AVX1-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2176(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2624(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3072(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3520(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm2[0],ymm15[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm1[0],mem[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1920(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm2[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm3[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 576(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm6[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 800(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd 688(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm8[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1024(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1248(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 1136(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm13[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1472(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vmovapd 1360(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 912(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm9[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 576(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm11[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm13[1],ymm14[0],ymm13[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm4[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1696(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] +; AVX1-ONLY-NEXT: vmovapd 1584(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm15[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 1920(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vmovapd 1808(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2144(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vmovapd 2032(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2368(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd 2256(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm9 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2592(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd 2480(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1],ymm12[0],ymm5[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm11[1],ymm13[0],ymm11[2],ymm13[2] -; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2816(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 2704(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3040(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 2928(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[1],ymm7[0],ymm0[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[1],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 3264(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3488(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 3376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = xmm2[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 304(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm9[0],ymm3[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovapd 528(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm15[0],ymm0[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 752(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[1],ymm0[0],ymm8[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 864(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 976(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm7[0],ymm11[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1200(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[1],ymm5[0],ymm13[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 1424(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[1],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm14[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm14[0],xmm12[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1648(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm12[0],xmm13[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 704(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm13[0],xmm7[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1872(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 928(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm9[0],xmm10[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm14[1],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1152(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm10[0],xmm6[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 2096(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm12[1],ymm0[0],ymm12[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1376(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 2320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm10[1],ymm8[0],ymm10[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 2544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1600(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm4[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[1],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 1824(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 2768(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[1],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 2992(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2592(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2496(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = xmm1[0],xmm15[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 3216(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm13[0],ymm0[2],ymm13[2] +; AVX1-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2720(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 3440(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm11[0],ymm0[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovapd 3040(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 2944(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm2[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3168(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm9[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 576(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 800(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, 864(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1024(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1088(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 1248(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1312(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm5 = mem[0],xmm5[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[3],ymm1[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 1472(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1536(%rdi), %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1696(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1760(%rdi), %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm0[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[3],ymm13[2] +; AVX1-ONLY-NEXT: vmovapd 1920(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 1984(%rdi), %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 2144(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2208(%rdi), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = mem[0],xmm12[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2368(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 2432(%rdi), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm8[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2592(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2656(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = mem[0,1],xmm12[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2816(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 2880(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3040(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3104(%rdi), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3264(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, 3328(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm13[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 3488(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 3552(%rdi), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm8 = mem[0],xmm11[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovupd (%rsp), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm15[0],ymm3[0],ymm15[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 640(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm14[0],ymm15[0],ymm14[3],ymm15[2] +; AVX1-ONLY-NEXT: vmovdqa 544(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 864(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm9[0],ymm12[0],ymm9[3],ymm12[2] +; AVX1-ONLY-NEXT: vmovdqa 768(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1088(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm6[0],ymm11[0],ymm6[3],ymm11[2] +; AVX1-ONLY-NEXT: vmovdqa 992(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1312(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm7[0],ymm10[0],ymm7[3],ymm10[2] +; AVX1-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1536(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm9[0],ymm5[3],ymm9[2] +; AVX1-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1760(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm8[0],ymm4[3],ymm8[2] +; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[3],ymm7[2] +; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[3],ymm6[2] +; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] -; AVX1-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[3],ymm5[2] +; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 1984(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm10[0],ymm4[3],ymm10[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2208(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[3],ymm12[2] -; AVX1-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2432(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm7[0],ymm3[3],ymm7[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 2656(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[3],ymm9[2] -; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[3],ymm4[2] +; AVX1-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm15[0],ymm4[0],ymm15[3],ymm4[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[3],ymm6[2] -; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 2880(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] +; AVX1-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 3104(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[0],ymm3[0],ymm0[3],ymm3[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] +; AVX1-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovapd 3328(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[3],ymm2[2] -; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2] +; AVX1-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vmovapd 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm13[0],ymm0[0],ymm13[3],ymm0[2] +; AVX1-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2,3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm12 = xmm12[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[0,1,2],ymm10[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = mem[0,1,2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm2[3] -; AVX1-ONLY-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} xmm10 = xmm10[0],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 448(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 320(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 288(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 416(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 384(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 288(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4264, %rsp # imm = 0x10A8 +; AVX1-ONLY-NEXT: addq $3800, %rsp # imm = 0xED8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3976, %rsp # imm = 0xF88 -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: subq $3560, %rsp # imm = 0xDE8 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1392(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 1840(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 672(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 720(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1120(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1168(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1568(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 1616(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2016(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2064(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2960(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3360(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2512(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 2960(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 496(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 896(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 944(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1344(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1392(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1792(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1840(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2240(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2288(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2688(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vmovdqa 2736(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3136(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa 3184(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm6[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3408(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm13[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm11[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm9[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 832(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1280(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1728(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm14[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2176(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm15[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2624(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3520(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm10, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1056(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm14, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1952(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm14[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm7, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2400(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm9, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa 912(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2848(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm15 = xmm15[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3296(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm10[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 800(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm11, %ymm0 ; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa 688(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1696(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2144(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2592(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3040(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 2928(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 3488(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps 3376(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 3264(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2816(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1136(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 2368(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm13, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq 1920(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa 1584(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1472(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 1024(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 1808(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 576(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa 2032(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 2256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa 2480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 2704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm8, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa 2928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm5, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovdqa 3152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastq %xmm4, %ymm0 +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovdqa 3376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 960(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1184(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1408(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1632(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1856(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm12 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm12 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm14[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2080(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm12[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2304(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2528(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm15[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2752(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm10[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2976(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3200(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3424(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 800(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1024(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1248(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1376(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1696(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 1920(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 1824(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 2144(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 2368(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2272(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 2592(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2496(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 2720(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqa 3040(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 2944(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 2944(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 3264(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm13[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3168(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 3488(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovdqa 3392(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm4[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 640(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm2[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 864(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 768(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1088(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm7[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 992(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %ymm5, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1312(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1216(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1440(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1536(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1760(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 1664(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 1888(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 1984(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2208(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2112(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpalignr $8, (%rsp), %xmm2, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2432(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2336(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2656(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 2560(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23] ; AVX2-ONLY-NEXT: vmovdqa 2784(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 2880(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3104(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3008(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3232(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 3328(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 3552(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX2-ONLY-NEXT: vmovdqa 3456(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = mem[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = mem[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = mem[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = mem[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = mem[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = mem[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vpunpckhqdq (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = mem[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%r9) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 480(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 416(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 352(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 160(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 480(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 384(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 288(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 384(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 320(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 288(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 256(%rax) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, 224(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm13, 128(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: addq $3976, %rsp # imm = 0xF88 +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm15, (%rax) +; AVX2-ONLY-NEXT: addq $3560, %rsp # imm = 0xDE8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride7_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: subq $6664, %rsp # imm = 0x1A08 +; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm16 +; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm12 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa 1808(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm18, %zmm4, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm15, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm0 +; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11] -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4,11] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm16, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,7,14,0,0,7,14,0] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm19, %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,13,4,5,6,13] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm18, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm25 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm31, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm22 +; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm17 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 ; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm9, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm6, %zmm18, %zmm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm27 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm17, %zmm6 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm26 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm17, %zmm6 +; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm20 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm18, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm5 +; AVX512F-NEXT: vmovdqa 2368(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm19 +; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm22 +; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm16, %zmm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm17, %zmm7 +; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm14 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm7 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa 2816(%rdi), %ymm3 +; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm15 +; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm13 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm7 -; AVX512F-NEXT: vpermi2q %zmm12, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm17, %zmm7 +; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm12 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm18, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm16, %zmm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa 3264(%rdi), %ymm5 +; AVX512F-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm3 +; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm11, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm18, %zmm17 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,14,4,5,6,14] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm30, %zmm6, %zmm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm8, %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm7 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm31, %zmm0, %zmm7 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm4, %zmm7 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm6, %zmm7 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm5, %zmm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm6, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 2432(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm6, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1984(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 2880(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm4, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm31, %zmm4, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm2 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpermi2q %zmm25, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm15, %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm15, %zmm26 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm18, %zmm20, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [12,5,12,5,12,5,12,5] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] +; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm25, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm7, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm30 ; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm25, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm27 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm27 +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm21, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm29 +; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm19, %zmm28 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm21, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm13, %zmm3, %zmm31 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm15, %zmm19 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm22, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm8, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm15, %zmm20 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm30, %zmm8, %zmm2 +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 # 64-byte Folded Reload ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm31, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm11, %zmm15, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm11, %zmm15, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm3, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm18 +; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 ; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm15 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm20 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm25 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm26 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm27 +; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm31 ; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <0,7,14,u> +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm18 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm18 ; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <0,7,14,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm19 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm11[0,1,4,5],zmm6[4,5,0,1] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm6[0,1,4,5],zmm5[4,5,0,1] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] -; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm29 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm13[0,1,4,5],zmm18[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k1} = zmm0[0,1,4,5],zmm16[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm0 +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm17 # 64-byte Reload +; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm22[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm0[0,1,4,5],zmm17[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm13[0,1,4,5],zmm23[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm22, %zmm9, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm7[0,1,4,5],zmm21[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k1} = zmm12[0,1,4,5],zmm30[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm17 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm11, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm30 {%k1} = zmm9[0,1,4,5],zmm29[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm23, %zmm11, %zmm13 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm10[0,1,4,5],zmm24[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[0,1,4,5],zmm23[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[0,1,4,5],zmm5[4,5,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm9, %zmm13 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm22, %zmm14, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm14, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm0[0,1,4,5],zmm30[4,5,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm30, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[0,1,4,5],zmm1[4,5,0,1] +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm1[0,1,4,5],zmm5[4,5,0,1] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm30 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm30, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm24 = [6,13] +; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm6 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm30, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm24, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm30, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm24, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm30, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm15, %zmm24, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm14, %zmm22, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm22, %zmm24, %zmm14 +; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm7, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm1, %zmm15, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: movb $-32, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11513,100 +9891,108 @@ ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 {%k2} -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm1 {%k2} +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm1 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovaps %ymm1, %ymm21 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm23, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm15, %zmm23 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm1 +; AVX512F-NEXT: vinsertf64x4 $0, %ymm21, %zmm1, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm21 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm15, %zmm21 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovaps %ymm1, %ymm26 +; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm22 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm15, %zmm25 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 +; AVX512F-NEXT: vinsertf64x4 $0, %ymm26, %zmm1, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm15, %zmm26 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm1 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm14 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm14, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm15, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm15, %zmm1, %zmm1 +; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm1, %zmm1 ; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 2304(%rdi), %ymm15 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm1, %zmm15 +; AVX512F-NEXT: vinserti32x4 $0, %xmm15, %zmm29, %zmm15 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 1856(%rdi), %ymm1 +; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-NEXT: vmovaps %ymm1, %ymm29 +; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm1 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload @@ -11614,1082 +10000,1087 @@ ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 {%k2} ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm14, %zmm14 -; AVX512F-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vinsertf64x4 $0, %ymm29, %zmm14, %zmm29 ; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm14 ; AVX512F-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23] ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512F-NEXT: vinserti32x4 $0, %xmm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm9, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm31, 320(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm18, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm19, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm30, 256(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm31, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm0, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm12, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 448(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm20, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm19, 448(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm20, 384(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm24, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm28, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm2, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm15, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r8) ; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%r8) ; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm28, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm23, (%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-NEXT: vmovaps %zmm0, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovaps %zmm29, 448(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6536, %rsp # imm = 0x1988 -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm5 +; AVX512BW-NEXT: subq $6600, %rsp # imm = 0x19C8 +; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm19 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm19, %zmm5, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm16, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,11] +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,7,14,0,0,7,14,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,13,4,5,6,13] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm30 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm22 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1 +; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm6 +; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm18, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm6 +; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6 ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm6 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [5,12] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm28 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm28, %zmm16 +; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm16, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm4 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm4, %zmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm13, %zmm11 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,6,14,4,5,6,14] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm12 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm12 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm10 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm10 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm10 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm3 # 64-byte Folded Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm3 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm2 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm4, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm19, %zmm7, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm1 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-NEXT: vpermi2q %zmm23, %zmm1, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vpermi2q %zmm28, %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm7, %zmm6 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [12,5,12,5,12,5,12,5] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm1 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm27 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm20, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm20, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm7, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm30 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm6, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm28, %zmm19 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm18 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm4 # 64-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm28, %zmm20 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm4 # 64-byte Folded Reload ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm28, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm17, %zmm28, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm28 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,9,0,5,6,9] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm18 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm19 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,10,0,5,6,10] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm20 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,11,0,5,6,11] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm27 +; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm26 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,7,14,u> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm28 +; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm28 # 64-byte Folded Reload ; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm8[0,1,4,5],zmm26[4,5,0,1] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm28 {%k1} = zmm1[0,1,4,5],zmm3[4,5,0,1] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,9,0,7,0,9,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm29 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm16[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k1} = zmm13[0,1,4,5],zmm17[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm17, %zmm27 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k1} = zmm13[0,1,4,5],zmm6[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[0,1,4,5],zmm15[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm14, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm12[0,1,4,5],zmm0[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm17, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm0[0,1,4,5],zmm12[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm0[0,1,4,5],zmm21[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[0,1,4,5],zmm4[4,5,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm17, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm10[0,1,4,5],zmm11[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm1[0,1,4,5],zmm5[4,5,0,1] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm17, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm0[0,1,4,5],zmm9[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm17, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm0[0,1,4,5],zmm8[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm17, %zmm10 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm3, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[0,1,4,5],zmm31[4,5,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm22, %zmm0, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm3, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm0[0,1,4,5],zmm1[4,5,0,1] +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <9,0,7,u> +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [6,13] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm31, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm9 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm31, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm22, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups (%rsp), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-NEXT: vmovups %zmm1, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm1, %zmm18, %zmm1 ; AVX512BW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm1 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-32, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm18 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm18 = mem[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm18, %zmm23, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm19 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm19 = mem[8,9,10,11,12,13,14,15],ymm19[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm19[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm19, %xmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm19, %zmm20, %zmm19 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm19 {%k2} +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm20 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm20 = mem[8,9,10,11,12,13,14,15],ymm20[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm20[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2} -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm21 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm21 = mem[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23] ; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm0, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %ymm22 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm22 = mem[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm22, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm23 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm23 = mem[8,9,10,11,12,13,14,15],ymm23[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm23[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm23, %xmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm23, %zmm0, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm24 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm24 = mem[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti32x4 $0, %xmm24, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm25 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm25 = mem[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti32x4 $1, %ymm25, %xmm25 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm25, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rsi) +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm21, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k2} +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm26 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm26 = mem[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm26, %xmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %ymm26 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm26 = mem[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm26, %xmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm22, %zmm24 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %ymm26 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm26 = mem[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm26, %xmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %ymm26 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm26 = mem[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm26, %xmm26 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm22, %zmm22 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %ymm26 +; AVX512BW-NEXT: vpalignr {{.*#+}} ymm26 = mem[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23] +; AVX512BW-NEXT: vextracti32x4 $1, %ymm26, %xmm26 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm26, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rsi) ; AVX512BW-NEXT: vmovdqa64 %zmm27, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 320(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 384(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%r8) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6536, %rsp # imm = 0x1988 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-NEXT: addq $6600, %rsp # imm = 0x19C8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <448 x i64>, ptr %in.vec, align 64 @@ -12716,6 +11107,10 @@ ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} +; AVX512BW-ONLY-FAST: {{.*}} +; AVX512BW-ONLY-SLOW: {{.*}} +; AVX512DQBW-FAST: {{.*}} +; AVX512DQBW-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -119,30 +119,30 @@ ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX512-NEXT: vmovaps (%rdi), %xmm1 -; AVX512-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX512-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX512-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0] -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX512-NEXT: vmovaps %xmm5, (%rsi) -; AVX512-NEXT: vmovaps %xmm0, (%rdx) -; AVX512-NEXT: vmovaps %xmm6, (%rcx) -; AVX512-NEXT: vmovaps %xmm1, (%r8) -; AVX512-NEXT: vmovaps %xmm7, (%r9) -; AVX512-NEXT: vmovaps %xmm2, (%r11) -; AVX512-NEXT: vmovaps %xmm8, (%r10) -; AVX512-NEXT: vmovaps %xmm3, (%rax) +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX512-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX512-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX512-NEXT: vpunpcklqdq 72(%rdi){1to2}, %xmm4, %xmm4 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512-NEXT: vpbroadcastq 24(%rdi), %xmm5 +; AVX512-NEXT: vpunpcklqdq 88(%rdi){1to2}, %xmm5, %xmm5 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512-NEXT: vpunpcklqdq 104(%rdi){1to2}, %xmm6, %xmm6 +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX512-NEXT: vpbroadcastq 56(%rdi), %xmm7 +; AVX512-NEXT: vpunpcklqdq 120(%rdi){1to2}, %xmm7, %xmm7 +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512-NEXT: vmovaps %xmm1, (%rcx) +; AVX512-NEXT: vmovdqa %xmm5, (%r8) +; AVX512-NEXT: vmovaps %xmm2, (%r9) +; AVX512-NEXT: vmovdqa %xmm6, (%r11) +; AVX512-NEXT: vmovaps %xmm3, (%r10) +; AVX512-NEXT: vmovdqa %xmm7, (%rax) ; AVX512-NEXT: retq %wide.vec = load <16 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> @@ -167,69 +167,69 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind { ; SSE-LABEL: load_i64_stride8_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps 112(%rdi), %xmm5 ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm8 -; SSE-NEXT: movaps 224(%rdi), %xmm10 +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps 224(%rdi), %xmm9 ; SSE-NEXT: movaps 160(%rdi), %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm12 -; SSE-NEXT: movaps 208(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdi), %xmm14 -; SSE-NEXT: movaps (%rdi), %xmm7 +; SSE-NEXT: movaps 96(%rdi), %xmm11 +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps 144(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdi), %xmm14 +; SSE-NEXT: movaps 128(%rdi), %xmm7 +; SSE-NEXT: movaps 64(%rdi), %xmm15 +; SSE-NEXT: movaps (%rdi), %xmm8 ; SSE-NEXT: movaps 16(%rdi), %xmm6 ; SSE-NEXT: movaps 32(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm15 -; SSE-NEXT: movaps 128(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] +; SSE-NEXT: movaps 48(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm15[1] ; SSE-NEXT: movaps %xmm7, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] -; SSE-NEXT: movaps %xmm2, %xmm14 +; SSE-NEXT: movaps %xmm6, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm13[1] -; SSE-NEXT: movaps %xmm6, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm13[1] +; SSE-NEXT: movaps %xmm3, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1] -; SSE-NEXT: movaps %xmm0, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] +; SSE-NEXT: movaps %xmm4, %xmm12 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; SSE-NEXT: movaps 176(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] -; SSE-NEXT: movaps %xmm15, (%rsi) -; SSE-NEXT: movaps %xmm11, 16(%rsi) -; SSE-NEXT: movaps %xmm7, (%rdx) -; SSE-NEXT: movaps %xmm9, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm14, 16(%rcx) +; SSE-NEXT: movaps %xmm15, 16(%rsi) +; SSE-NEXT: movaps %xmm10, (%rsi) +; SSE-NEXT: movaps %xmm7, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movaps %xmm13, 16(%rcx) +; SSE-NEXT: movaps %xmm14, (%rcx) +; SSE-NEXT: movaps %xmm3, 16(%r8) ; SSE-NEXT: movaps %xmm6, (%r8) -; SSE-NEXT: movaps %xmm2, 16(%r8) -; SSE-NEXT: movaps %xmm10, (%r9) -; SSE-NEXT: movaps %xmm12, 16(%r9) +; SSE-NEXT: movaps %xmm11, 16(%r9) +; SSE-NEXT: movaps %xmm12, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm9, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf4: @@ -434,18 +434,18 @@ ; SSE-NEXT: movaps 336(%rdi), %xmm11 ; SSE-NEXT: movaps 464(%rdi), %xmm6 ; SSE-NEXT: movaps 400(%rdi), %xmm7 -; SSE-NEXT: movaps 80(%rdi), %xmm0 -; SSE-NEXT: movaps 208(%rdi), %xmm1 +; SSE-NEXT: movaps 208(%rdi), %xmm0 ; SSE-NEXT: movaps 144(%rdi), %xmm8 +; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps 320(%rdi), %xmm2 ; SSE-NEXT: movaps 256(%rdi), %xmm10 ; SSE-NEXT: movaps 448(%rdi), %xmm3 ; SSE-NEXT: movaps 384(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm4 -; SSE-NEXT: movaps (%rdi), %xmm13 +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 128(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm14 ; SSE-NEXT: movaps 16(%rdi), %xmm9 -; SSE-NEXT: movaps 192(%rdi), %xmm5 -; SSE-NEXT: movaps 128(%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm5[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -466,16 +466,16 @@ ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: movaps %xmm9, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -487,123 +487,123 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] ; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] ; SSE-NEXT: movaps 224(%rdi), %xmm0 ; SSE-NEXT: movaps 160(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] -; SSE-NEXT: movaps 352(%rdi), %xmm0 -; SSE-NEXT: movaps 288(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movaps 480(%rdi), %xmm0 +; SSE-NEXT: movaps 416(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] -; SSE-NEXT: movaps 480(%rdi), %xmm1 -; SSE-NEXT: movaps 416(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE-NEXT: movaps 352(%rdi), %xmm1 +; SSE-NEXT: movaps 288(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps 112(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: movaps 48(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; SSE-NEXT: movaps 176(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 -; SSE-NEXT: movaps 304(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: movaps 304(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: movaps 496(%rdi), %xmm0 ; SSE-NEXT: movaps 432(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps %xmm15, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movaps %xmm11, 32(%r9) -; SSE-NEXT: movaps %xmm8, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm6, 32(%r9) +; SSE-NEXT: movaps %xmm10, 48(%r9) +; SSE-NEXT: movaps %xmm14, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm4, 48(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) -; SSE-NEXT: movaps %xmm12, (%rax) +; SSE-NEXT: movaps %xmm11, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm5, 32(%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) -; SSE-NEXT: movaps %xmm14, (%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm3, 48(%rax) +; SSE-NEXT: movaps %xmm8, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, 48(%rax) -; SSE-NEXT: movaps %xmm3, 32(%rax) -; SSE-NEXT: movaps %xmm6, 16(%rax) -; SSE-NEXT: movaps %xmm10, (%rax) +; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm12, (%rax) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $184, %rsp -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] @@ -615,14 +615,14 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -634,37 +634,37 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm2[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm1[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] @@ -674,47 +674,47 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 32(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm11, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: addq $184, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -722,84 +722,84 @@ ; AVX2-ONLY-LABEL: load_i64_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $184, %rsp -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm5[0] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm8[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm9[0],xmm7[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm5[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm8[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm5[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm2[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm2[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm2[0],xmm0[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm2[0],xmm0[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm2[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] @@ -810,47 +810,47 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm8, (%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 32(%r9) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 48(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm12, (%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 48(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX2-ONLY-NEXT: addq $184, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1114,21 +1114,21 @@ ; SSE-LABEL: load_i64_stride8_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $664, %rsp # imm = 0x298 -; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm2 -; SSE-NEXT: movaps 896(%rdi), %xmm9 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 384(%rdi), %xmm10 -; SSE-NEXT: movaps 576(%rdi), %xmm4 -; SSE-NEXT: movaps 512(%rdi), %xmm11 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps (%rdi), %xmm12 -; SSE-NEXT: movaps 704(%rdi), %xmm6 -; SSE-NEXT: movaps 640(%rdi), %xmm13 -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm14 +; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps 768(%rdi), %xmm9 +; SSE-NEXT: movaps 320(%rdi), %xmm3 +; SSE-NEXT: movaps 256(%rdi), %xmm10 +; SSE-NEXT: movaps 704(%rdi), %xmm4 +; SSE-NEXT: movaps 640(%rdi), %xmm11 +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm12 +; SSE-NEXT: movaps 576(%rdi), %xmm6 +; SSE-NEXT: movaps 512(%rdi), %xmm13 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1164,7 +1164,7 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1261,11 +1261,11 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] ; SSE-NEXT: movaps 736(%rdi), %xmm0 -; SSE-NEXT: movaps 672(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movaps 672(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] ; SSE-NEXT: movaps 864(%rdi), %xmm0 ; SSE-NEXT: movaps 800(%rdi), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm1 @@ -1286,11 +1286,11 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps 176(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] ; SSE-NEXT: movaps 368(%rdi), %xmm0 ; SSE-NEXT: movaps 304(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, %xmm1 @@ -1324,54 +1324,54 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%r8) @@ -1406,7 +1406,7 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm7, 112(%rax) ; SSE-NEXT: movaps %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) +; SSE-NEXT: movaps %xmm13, 80(%rax) ; SSE-NEXT: movaps %xmm15, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) @@ -1436,7 +1436,7 @@ ; SSE-NEXT: movaps %xmm5, 64(%rax) ; SSE-NEXT: movaps %xmm14, 48(%rax) ; SSE-NEXT: movaps %xmm10, 32(%rax) -; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm12, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: addq $664, %rsp # imm = 0x298 @@ -1444,86 +1444,86 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $792, %rsp # imm = 0x318 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm10[0],ymm13[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1545,242 +1545,242 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm9[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm12[0],ymm7[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm13[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm10[0],ymm2[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm4[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm9[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%r9) +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) -; AVX1-ONLY-NEXT: addq $792, %rsp # imm = 0x318 +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride8_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $808, %rsp # imm = 0x328 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1789,35 +1789,35 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] @@ -1841,195 +1841,195 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm15[0],ymm4[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%r9) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm15[1],ymm4[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 80(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 112(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX2-ONLY-NEXT: addq $808, %rsp # imm = 0x328 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -2517,21 +2517,21 @@ ; SSE-LABEL: load_i64_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $1688, %rsp # imm = 0x698 -; SSE-NEXT: movaps 832(%rdi), %xmm0 -; SSE-NEXT: movaps 320(%rdi), %xmm1 -; SSE-NEXT: movaps 256(%rdi), %xmm8 -; SSE-NEXT: movaps 960(%rdi), %xmm2 -; SSE-NEXT: movaps 896(%rdi), %xmm10 -; SSE-NEXT: movaps 448(%rdi), %xmm3 -; SSE-NEXT: movaps 384(%rdi), %xmm9 -; SSE-NEXT: movaps 576(%rdi), %xmm4 -; SSE-NEXT: movaps 512(%rdi), %xmm12 -; SSE-NEXT: movaps 64(%rdi), %xmm5 -; SSE-NEXT: movaps (%rdi), %xmm11 -; SSE-NEXT: movaps 704(%rdi), %xmm6 -; SSE-NEXT: movaps 640(%rdi), %xmm14 -; SSE-NEXT: movaps 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm13 +; SSE-NEXT: movaps 960(%rdi), %xmm0 +; SSE-NEXT: movaps 448(%rdi), %xmm1 +; SSE-NEXT: movaps 384(%rdi), %xmm8 +; SSE-NEXT: movaps 832(%rdi), %xmm2 +; SSE-NEXT: movaps 768(%rdi), %xmm10 +; SSE-NEXT: movaps 320(%rdi), %xmm3 +; SSE-NEXT: movaps 256(%rdi), %xmm9 +; SSE-NEXT: movaps 704(%rdi), %xmm4 +; SSE-NEXT: movaps 640(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdi), %xmm5 +; SSE-NEXT: movaps 128(%rdi), %xmm11 +; SSE-NEXT: movaps 576(%rdi), %xmm6 +; SSE-NEXT: movaps 512(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm7 +; SSE-NEXT: movaps (%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2567,14 +2567,7 @@ ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm2[1] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 768(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1216(%rdi), %xmm0 -; SSE-NEXT: movaps 1152(%rdi), %xmm1 +; SSE-NEXT: movaps 896(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2587,8 +2580,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1472(%rdi), %xmm0 -; SSE-NEXT: movaps 1408(%rdi), %xmm1 +; SSE-NEXT: movaps 1216(%rdi), %xmm0 +; SSE-NEXT: movaps 1152(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2601,8 +2594,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1728(%rdi), %xmm0 -; SSE-NEXT: movaps 1664(%rdi), %xmm1 +; SSE-NEXT: movaps 1472(%rdi), %xmm0 +; SSE-NEXT: movaps 1408(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2615,8 +2608,8 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 1984(%rdi), %xmm0 -; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: movaps 1728(%rdi), %xmm0 +; SSE-NEXT: movaps 1664(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2629,6 +2622,13 @@ ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 1984(%rdi), %xmm0 +; SSE-NEXT: movaps 1920(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 @@ -2951,14 +2951,6 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rsi) @@ -2967,13 +2959,13 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps %xmm0, 224(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps %xmm0, 160(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps %xmm0, 96(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2983,38 +2975,46 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 224(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rdx) +; SSE-NEXT: movaps %xmm0, 224(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rdx) +; SSE-NEXT: movaps %xmm0, 192(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rdx) +; SSE-NEXT: movaps %xmm0, 160(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rdx) +; SSE-NEXT: movaps %xmm0, 128(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rdx) +; SSE-NEXT: movaps %xmm0, 96(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 64(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%rcx) @@ -3199,54 +3199,54 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: subq $2216, %rsp # imm = 0x8A8 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 @@ -3260,26 +3260,26 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3294,122 +3294,112 @@ ; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm3[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm8[0],ymm11[0],ymm8[2],ymm11[2] ; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm2[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm6[0],xmm9[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm12[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm3[1],ymm8[3],ymm3[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm11[1],ymm8[3],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3420,6 +3410,10 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm9[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3428,80 +3422,80 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],xmm13[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3512,8 +3506,8 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3521,323 +3515,329 @@ ; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm13[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm4[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm5[0],xmm12[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm7[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm1[0],xmm4[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm8[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm12[0],xmm0[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm13[1] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm11[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%r9) +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: addq $2216, %rsp # imm = 0x8A8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3845,52 +3845,52 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm4[0] -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0] -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm6[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0] -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 @@ -3898,26 +3898,26 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3980,172 +3980,166 @@ ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm12[0],ymm15[0],ymm12[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -4153,17 +4147,23 @@ ; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] @@ -4176,144 +4176,138 @@ ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm12[0],ymm13[0],ymm12[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm15 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm15 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4323,6 +4317,8 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) @@ -4333,14 +4329,14 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) @@ -4349,25 +4345,25 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -4379,576 +4375,585 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%r9) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%r9) -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rax) ; AVX2-ONLY-NEXT: addq $2408, %rsp # imm = 0x968 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i64_stride8_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512F-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm26 +; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm21 +; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm27 ; AVX512F-NEXT: movb $-64, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm28[0],ymm1[2],ymm28[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm25[0],ymm5[2],ymm25[2] +; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm18[0],ymm1[2],ymm18[2] +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm16[0],ymm5[2],ymm16[2] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 704(%rdi), %ymm23 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm17[0],ymm23[0],ymm17[2],ymm23[2] -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm8 +; AVX512F-NEXT: vmovdqa 1216(%rdi), %ymm4 +; AVX512F-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm19[0],ymm15[0],ymm19[2],ymm15[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm21, %zmm12, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm30 +; AVX512F-NEXT: vpermi2q %zmm25, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm14 -; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm18[0],ymm8[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm8 +; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm20 +; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm22 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm28[1],ymm1[3],ymm28[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm25[1],ymm5[3],ymm25[3] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm0, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm18[1],ymm1[3],ymm18[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm16[1],ymm5[3],ymm16[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm23[1],ymm17[3],ymm23[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm4[1],ymm16[3],ymm4[3] -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm15[1],ymm19[3],ymm15[3] +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm21, %zmm23, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm25, %zmm30, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm18[1],ymm8[3],ymm18[3] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm18[0],zmm14[0],zmm18[2],zmm14[2],zmm18[4],zmm14[4],zmm18[6],zmm14[6] +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm24 ; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm25[0],zmm15[2],zmm25[2],zmm15[4],zmm25[4],zmm15[6],zmm25[6] +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm18, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7] -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm31 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm23[1],zmm21[3],zmm23[3],zmm21[5],zmm23[5],zmm21[7],zmm23[7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm27[1],zmm20[1],zmm27[3],zmm20[3],zmm27[5],zmm20[5],zmm27[7],zmm20[7] ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm26, %zmm23, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm12 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm19, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm18, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm3, %zmm21 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm23, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,5,13] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm6, %zmm5, %zmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm22, %zmm23 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm14[1],zmm18[3],zmm14[3],zmm18[5],zmm14[5],zmm18[7],zmm14[7] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm18[0],zmm1[0],zmm18[2],zmm1[2],zmm18[4],zmm1[4],zmm18[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm18[1],zmm1[1],zmm18[3],zmm1[3],zmm18[5],zmm1[5],zmm18[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512F-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm24, %zmm5, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm1[1],zmm5[3],zmm1[3],zmm5[5],zmm1[5],zmm5[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm12[0],zmm28[0],zmm12[2],zmm28[2],zmm12[4],zmm28[4],zmm12[6],zmm28[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm12[1],zmm28[1],zmm12[3],zmm28[3],zmm12[5],zmm28[5],zmm12[7],zmm28[7] -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm23, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm18, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm6, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7] -; AVX512F-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm24 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm11[0],zmm1[0],zmm11[2],zmm1[2],zmm11[4],zmm1[4],zmm11[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm11[1],zmm1[1],zmm11[3],zmm1[3],zmm11[5],zmm1[5],zmm11[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm25 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm1, %zmm19, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm27 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm14 {%k1} +; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm9 # 32-byte Folded Reload +; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vpermi2q %zmm20, %zmm6, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15] -; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-NEXT: vpermi2q %zmm20, %zmm6, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm13[0],xmm14[0] -; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm10[0],xmm7[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm8, %ymm21, %ymm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm8[0],xmm9[0] +; AVX512F-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm16[0],xmm21[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm14, %ymm28, %ymm14 +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %xmm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm21[0],xmm12[0] +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 704(%rdi), %xmm18 +; AVX512F-NEXT: vmovdqa64 640(%rdi), %xmm28 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm18[0] ; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm31 ; AVX512F-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm5[0],xmm31[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm30, %ymm9, %ymm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %xmm19 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %xmm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm19[0] -; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm30, %ymm4, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %xmm25 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %xmm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm30[0],xmm25[0] -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm3[0],xmm4[0] -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm5[0],xmm31[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm29, %ymm26, %ymm26 +; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa 1216(%rdi), %xmm11 +; AVX512F-NEXT: vmovdqa64 1152(%rdi), %xmm26 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm26[0],xmm11[0] +; AVX512F-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm29, %ymm6, %ymm6 +; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa 1728(%rdi), %xmm7 +; AVX512F-NEXT: vmovdqa64 1664(%rdi), %xmm25 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm25[0],xmm7[0] +; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm29, %ymm14, %ymm14 +; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm19, %zmm14 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm21[1] +; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm27[1],xmm19[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm21[1],xmm12[1] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm23 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm28[1],xmm18[1] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm31[1] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm13[1],xmm14[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm10[1],xmm7[1] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm30[1],xmm25[1] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm26[1],xmm11[1] ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm25[1],xmm7[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm2 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 +; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512F-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512F-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -4961,530 +4966,537 @@ ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovups (%rsp), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm6, 64(%rsi) +; AVX512F-NEXT: vinsertf64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm14, 192(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm6, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 192(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rcx) +; AVX512F-NEXT: vmovaps %zmm1, 128(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rcx) +; AVX512F-NEXT: vmovaps %zmm1, (%rcx) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 192(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%r8) +; AVX512F-NEXT: vmovaps %zmm1, 128(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%r8) +; AVX512F-NEXT: vmovaps %zmm1, (%r8) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 192(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%r9) +; AVX512F-NEXT: vmovaps %zmm1, 128(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512F-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 192(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, (%rax) +; AVX512F-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm1, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovaps %zmm11, 128(%rax) ; AVX512F-NEXT: vmovaps %zmm10, 192(%rax) -; AVX512F-NEXT: vmovaps %zmm9, (%rax) +; AVX512F-NEXT: vmovaps %zmm9, 128(%rax) ; AVX512F-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512F-NEXT: addq $2568, %rsp # imm = 0xA08 +; AVX512F-NEXT: vmovaps %zmm4, (%rax) +; AVX512F-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: load_i64_stride8_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2568, %rsp # imm = 0xA08 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-NEXT: subq $2472, %rsp # imm = 0x9A8 +; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm26 +; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm28 +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm29 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm27 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm28[0],ymm1[2],ymm28[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm25[0],ymm5[2],ymm25[2] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm1[0],ymm18[0],ymm1[2],ymm18[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm16 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm5[0],ymm16[0],ymm5[2],ymm16[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm7[2,3],ymm4[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %ymm23 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm17[0],ymm23[0],ymm17[2],ymm23[2] -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm11[2,3],ymm8[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm16[0],ymm4[0],ymm16[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm8, %zmm8 +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 1152(%rdi), %ymm12 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm15 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm19[0],ymm15[0],ymm19[2],ymm15[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm31 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm12, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm30 +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm14 -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm8[0],ymm18[0],ymm8[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm8 +; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm11[0],ymm8[0],ymm11[2],ymm8[2] +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm20 +; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm22 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm22[0],ymm20[0],ymm22[2],ymm20[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm28[1],ymm1[3],ymm28[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm25[1],ymm5[3],ymm25[3] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm0, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm18[1],ymm1[3],ymm18[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm16[1],ymm5[3],ymm16[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm17[1],ymm23[1],ymm17[3],ymm23[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm2[1],ymm10[3],ymm2[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm16[1],ymm4[1],ymm16[3],ymm4[3] -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm15[1],ymm19[3],ymm15[3] +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm21, %zmm23, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm15 +; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm25, %zmm30, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm18[1],ymm8[3],ymm18[3] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm8[1],ymm11[3],ymm8[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm22[1],ymm20[1],ymm22[3],ymm20[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm31[0],zmm19[0],zmm31[2],zmm19[2],zmm31[4],zmm19[4],zmm31[6],zmm19[6] +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm23[0],zmm24[2],zmm23[2],zmm24[4],zmm23[4],zmm24[6],zmm23[6] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm13[0],zmm15[2],zmm13[2],zmm15[4],zmm13[4],zmm15[6],zmm13[6] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm18[0],zmm14[0],zmm18[2],zmm14[2],zmm18[4],zmm14[4],zmm18[6],zmm14[6] +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm24 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm6[0],zmm7[0],zmm6[2],zmm7[2],zmm6[4],zmm7[4],zmm6[6],zmm7[6] +; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm29 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm7[0],zmm17[0],zmm7[2],zmm17[2],zmm7[4],zmm17[4],zmm7[6],zmm17[6] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm15[0],zmm25[0],zmm15[2],zmm25[2],zmm15[4],zmm25[4],zmm15[6],zmm25[6] +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm5, %zmm2, %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm18, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm23, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm31[1],zmm19[1],zmm31[3],zmm19[3],zmm31[5],zmm19[5],zmm31[7],zmm19[7] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm31 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm7, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm21[1],zmm23[1],zmm21[3],zmm23[3],zmm21[5],zmm23[5],zmm21[7],zmm23[7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm27[0],zmm20[0],zmm27[2],zmm20[2],zmm27[4],zmm20[4],zmm27[6],zmm20[6] +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm27[1],zmm20[1],zmm27[3],zmm20[3],zmm27[5],zmm20[5],zmm27[7],zmm20[7] ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm12 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm19, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm3, %zmm21 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm11[1],zmm16[1],zmm11[3],zmm16[3],zmm11[5],zmm16[5],zmm11[7],zmm16[7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k1} = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm16[1],zmm30[1],zmm16[3],zmm30[3],zmm16[5],zmm30[5],zmm16[7],zmm30[7] -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm18, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm23, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm7, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,13,5,13] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm19, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm7, %zmm16 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 = zmm18[1],zmm14[1],zmm18[3],zmm14[3],zmm18[5],zmm14[5],zmm18[7],zmm14[7] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k1} = zmm18[0],zmm1[0],zmm18[2],zmm1[2],zmm18[4],zmm1[4],zmm18[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm18[1],zmm1[1],zmm18[3],zmm1[3],zmm18[5],zmm1[5],zmm18[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm28 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm20 +; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm5, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm18, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm26 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm5[1],zmm1[1],zmm5[3],zmm1[3],zmm5[5],zmm1[5],zmm5[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k1} = zmm12[0],zmm28[0],zmm12[2],zmm28[2],zmm12[4],zmm28[4],zmm12[6],zmm28[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm12[1],zmm28[1],zmm12[3],zmm28[3],zmm12[5],zmm28[5],zmm12[7],zmm28[7] -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm6, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm6[1],zmm1[1],zmm6[3],zmm1[3],zmm6[5],zmm1[5],zmm6[7],zmm1[7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm30[0],zmm4[0],zmm30[2],zmm4[2],zmm30[4],zmm4[4],zmm30[6],zmm4[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 {%k1} = zmm30[1],zmm4[1],zmm30[3],zmm4[3],zmm30[5],zmm4[5],zmm30[7],zmm4[7] -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm24 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm14 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm24 {%k1} = zmm11[0],zmm1[0],zmm11[2],zmm1[2],zmm11[4],zmm1[4],zmm11[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm11[1],zmm1[1],zmm11[3],zmm1[3],zmm11[5],zmm1[5],zmm11[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm25 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm27 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm17 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm3[1],zmm2[1],zmm3[3],zmm2[3],zmm3[5],zmm2[5],zmm3[7],zmm2[7] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k1} = zmm31[0],zmm1[0],zmm31[2],zmm1[2],zmm31[4],zmm1[4],zmm31[6],zmm1[6] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm31[1],zmm1[1],zmm31[3],zmm1[3],zmm31[5],zmm1[5],zmm31[7],zmm1[7] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 {%k1} +; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm9 # 32-byte Folded Reload +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm6, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm7, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-NEXT: vpermi2q %zmm20, %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm13[0],xmm14[0] -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm10[0],xmm7[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm8, %ymm21, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm8[0],xmm9[0] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm16[0],xmm21[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm28, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %xmm12 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm21[0],xmm12[0] +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %xmm18 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %xmm28 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm18[0] ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm31 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm5[0],xmm31[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm30, %ymm9, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %xmm19 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %xmm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm27[0],xmm19[0] -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm2[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm30, %ymm4, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %xmm25 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %xmm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm30[0],xmm25[0] -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm3[0],xmm4[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm5[0],xmm31[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm29, %ymm26, %ymm26 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm15, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa 1216(%rdi), %xmm11 +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %xmm26 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm26[0],xmm11[0] +; AVX512BW-NEXT: vmovdqa 1088(%rdi), %xmm4 +; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm3 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm3[0],xmm4[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm29, %ymm6, %ymm6 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa 1728(%rdi), %xmm7 +; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %xmm25 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm25[0],xmm7[0] +; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm1 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm29, %ymm14, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm19, %zmm14 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm9[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm21[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm27[1],xmm19[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm21[1],xmm12[1] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm28[1],xmm18[1] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm31[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm13[1],xmm14[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm10[1],xmm7[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm30[1],xmm25[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm23, %zmm5 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm26[1],xmm11[1] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm23, %zmm3 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm25[1],xmm7[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm2 = mem[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm24, %zmm2 +; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512BW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovups (%rsp), %zmm7 # 64-byte Reload ; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -5497,69 +5509,63 @@ ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovups (%rsp), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm6, 64(%rsi) +; AVX512BW-NEXT: vinsertf64x4 $0, %ymm10, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rsi) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm6, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 192(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rcx) +; AVX512BW-NEXT: vmovaps %zmm1, (%rcx) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 192(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%r8) +; AVX512BW-NEXT: vmovaps %zmm1, 128(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%r8) +; AVX512BW-NEXT: vmovaps %zmm1, (%r8) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 192(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%r9) +; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9) +; AVX512BW-NEXT: vmovaps %zmm1, (%r9) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 192(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, (%rax) +; AVX512BW-NEXT: vmovaps %zmm1, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm1, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm1, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovaps %zmm11, 128(%rax) ; AVX512BW-NEXT: vmovaps %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovaps %zmm9, (%rax) +; AVX512BW-NEXT: vmovaps %zmm9, 128(%rax) ; AVX512BW-NEXT: vmovaps %zmm7, 64(%rax) -; AVX512BW-NEXT: addq $2568, %rsp # imm = 0xA08 +; AVX512BW-NEXT: vmovaps %zmm4, (%rax) +; AVX512BW-NEXT: addq $2472, %rsp # imm = 0x9A8 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <256 x i64>, ptr %in.vec, align 64 @@ -6970,379 +6976,387 @@ ; ; AVX1-ONLY-LABEL: load_i64_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $4984, %rsp # imm = 0x1378 -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 +; AVX1-ONLY-NEXT: subq $4904, %rsp # imm = 0x1328 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 3904(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 3840(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3328(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2880(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 2816(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3328(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3904(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3840(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] -; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 848(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1152(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1104(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 1040(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1296(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 1920(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1616(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 1872(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1552(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 1808(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2240(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2496(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2176(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2432(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2064(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2752(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3008(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2688(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2944(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 2640(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 2896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2576(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 2832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3264(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3520(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3200(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3456(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3152(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3088(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 3344(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3776(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 4032(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3712(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3968(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 3664(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 3920(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3600(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 3856(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd (%rsp), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -7362,7 +7376,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -7451,14 +7465,6 @@ ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] @@ -7655,187 +7661,191 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 624(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 560(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm5[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1136(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1072(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1696(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1584(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2208(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2096(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm9[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2720(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2672(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2608(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm10[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 928(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX1-ONLY-NEXT: vmovaps 880(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 816(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1392(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 1328(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm8[0],xmm9[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3296(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1952(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovaps 1904(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 1840(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3232(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 2528(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3184(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3120(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm12[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2464(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2352(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3040(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm13[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 2976(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3808(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 2928(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 2864(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3744(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 3552(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3696(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3632(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 3488(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3440(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3376(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 4064(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 4000(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 3952(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 3888(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm13 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm7[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7846,451 +7856,443 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm9[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm8[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm9[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm11[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rsi) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = mem[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r9) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 496(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 368(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 272(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm8, (%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 176(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 480(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 416(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 448(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) @@ -8298,213 +8300,207 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $4984, %rsp # imm = 0x1378 +; AVX1-ONLY-NEXT: vmovaps %ymm13, (%rax) +; AVX1-ONLY-NEXT: addq $4904, %rsp # imm = 0x1328 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: load_i64_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $5560, %rsp # imm = 0x15B8 -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm2[0] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm8[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -8515,180 +8511,180 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1088(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1344(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1024(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1280(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1216(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1472(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1152(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1408(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1600(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1856(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1536(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1792(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1728(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovaps 1664(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 1984(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 1920(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2112(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2368(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2048(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2304(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2240(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 2176(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 2496(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 2432(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2624(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2880(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2560(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2816(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2752(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 2688(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 3008(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 2944(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3136(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3072(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 3264(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 3200(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3392(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 3328(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3520(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 3456(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3648(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3584(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 3776(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 3712(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3904(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 3840(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 4032(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3968(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] @@ -8741,6 +8737,14 @@ ; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -8799,14 +8803,6 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] @@ -8960,7 +8956,7 @@ ; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %xmm0 @@ -9011,320 +9007,280 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1120(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1056(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1248(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1184(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1632(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1568(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1696(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2144(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2080(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2272(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2208(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2656(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2592(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2784(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2720(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1376(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1312(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 1504(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 1440(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3168(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 1888(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3104(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 1824(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3296(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 3232(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm10[0],ymm14[2],ymm10[2] +; AVX2-ONLY-NEXT: vmovaps 2016(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 1952(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3680(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 3616(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 3808(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 3744(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm6[0],ymm13[2],ymm6[2] +; AVX2-ONLY-NEXT: vmovaps 2400(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2336(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2528(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 2464(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 3936(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 2912(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 2848(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3040(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 2976(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3424(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 3360(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 3552(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 3488(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm11[0],ymm14[0],ymm11[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 3936(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 3872(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovaps 4064(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 4000(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm12[0],ymm4[0],ymm12[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm15[1],ymm10[3],ymm15[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm10[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],ymm14[1],ymm11[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm15 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm10[1],ymm14[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm13[1],ymm6[1],ymm13[3],ymm6[3] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm4[1],ymm12[3],ymm4[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 480(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 352(%rsi) @@ -9337,7 +9293,9 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 32(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9351,37 +9309,63 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 112(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rsi) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rsi) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 96(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -9391,45 +9375,37 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 48(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 464(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 240(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 320(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 304(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 256(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 368(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 272(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 208(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 128(%rdx) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm1, 496(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 144(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 80(%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdx) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -9447,6 +9423,22 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%r8) @@ -9490,7 +9482,7 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm1, 432(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 416(%r9) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%r9) @@ -9611,51 +9603,47 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm3, 448(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 384(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 288(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm15, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9674,122 +9662,122 @@ ; ; AVX512F-ONLY-SLOW-LABEL: load_i64_stride8_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9808,55 +9796,53 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9864,174 +9850,175 @@ ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10042,32 +10029,33 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10078,33 +10066,32 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -10114,96 +10101,97 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -10214,476 +10202,474 @@ ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -10697,234 +10683,234 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: load_i64_stride8_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $-64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10943,55 +10929,53 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10999,174 +10983,175 @@ ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11177,32 +11162,33 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11213,33 +11199,32 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -11249,96 +11234,97 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -11349,476 +11335,474 @@ ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -11832,234 +11816,234 @@ ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: load_i64_stride8_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512DQ-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: movb $-64, %al ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512DQ-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512DQ-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12078,55 +12062,53 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12134,174 +12116,175 @@ ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12312,32 +12295,33 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12348,33 +12332,32 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -12384,96 +12367,97 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -12484,476 +12468,474 @@ ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512DQ-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -12967,234 +12949,234 @@ ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: load_i64_stride8_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512DQ-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $-64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQ-FAST-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13213,55 +13195,53 @@ ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13269,174 +13249,175 @@ ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13447,32 +13428,33 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13483,33 +13465,32 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -13519,96 +13500,97 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -13619,476 +13601,474 @@ ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512DQ-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512DQ-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512DQ-FAST-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14102,234 +14082,234 @@ ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride8_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14348,55 +14328,53 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14404,174 +14382,175 @@ ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14582,32 +14561,33 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -14618,33 +14598,32 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -14654,96 +14633,97 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -14754,476 +14734,474 @@ ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -15237,234 +15215,234 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i64_stride8_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15483,55 +15461,53 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15539,174 +15515,175 @@ ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15717,32 +15694,33 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -15753,33 +15731,32 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -15789,96 +15766,97 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -15889,476 +15867,474 @@ ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16372,234 +16348,234 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i64_stride8_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: movb $-64, %al ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16618,55 +16594,53 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16674,174 +16648,175 @@ ; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQBW-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16852,32 +16827,33 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16888,33 +16864,32 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -16924,96 +16899,97 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -17024,476 +17000,474 @@ ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512DQBW-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -17507,234 +17481,234 @@ ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rcx) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i64_stride8_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512DQBW-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: subq $6664, %rsp # imm = 0x1A08 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $-64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512DQBW-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm9[0],ymm3[2],ymm9[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm0[0],ymm29[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %ymm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm25[0],ymm26[0],ymm25[2],ymm26[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm31 +; AVX512DQBW-FAST-NEXT: vmovdqa 1152(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm31[0],ymm8[2],ymm31[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 1728(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm17[0],ymm12[0],ymm17[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm30[0],ymm22[0],ymm30[2],ymm22[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 2240(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 2176(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 2112(%rdi), %ymm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2048(%rdi), %ymm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm24[0],ymm27[0],ymm24[2],ymm27[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17753,55 +17727,53 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa 2624(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 3264(%rdi), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 3200(%rdi), %ymm3 ; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 3136(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3072(%rdi), %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm18[0],ymm20[0],ymm18[2],ymm20[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 3776(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 3712(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[2],ymm6[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %ymm4 ; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17809,174 +17781,175 @@ ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512DQBW-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq (%rsp), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm14 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm14 ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm29, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm29[1],mem[1],ymm29[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm25[1],ymm26[1],ymm25[3],ymm26[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm2, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm8[1],ymm31[1],ymm8[3],ymm31[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm17[1],ymm12[1],ymm17[3],ymm12[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm30[1],ymm22[1],ymm30[3],ymm22[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm24[1],ymm27[1],ymm24[3],ymm27[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = ymm9[1],mem[1],ymm9[3],mem[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm18[1],ymm20[1],ymm18[3],ymm20[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm6[1],ymm3[3],ymm6[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm16[0],zmm23[0],zmm16[2],zmm23[2],zmm16[4],zmm23[4],zmm16[6],zmm23[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm25[0],zmm1[2],zmm25[2],zmm1[4],zmm25[4],zmm1[6],zmm25[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm8[0],zmm19[0],zmm8[2],zmm19[2],zmm8[4],zmm19[4],zmm8[6],zmm19[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17987,32 +17960,33 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm30[0],zmm12[2],zmm30[2],zmm12[4],zmm30[4],zmm12[6],zmm30[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 = zmm15[0],mem[0],zmm15[2],mem[2],zmm15[4],mem[4],zmm15[6],mem[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18023,33 +17997,32 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm4 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm6[0],zmm21[0],zmm6[2],zmm21[2],zmm6[4],zmm21[4],zmm6[6],zmm21[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 @@ -18059,96 +18032,97 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 = zmm9[0],zmm10[0],zmm9[2],zmm10[2],zmm9[4],zmm10[4],zmm9[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm11 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm16[1],zmm28[1],zmm16[3],zmm28[3],zmm16[5],zmm28[5],zmm16[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm23[1],zmm25[3],zmm23[3],zmm25[5],zmm23[5],zmm25[7],zmm23[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm15[1],mem[1],zmm15[3],mem[3],zmm15[5],mem[5],zmm15[7],mem[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm17[1],mem[1],zmm17[3],mem[3],zmm17[5],mem[5],zmm17[7],mem[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm27[1],mem[1],zmm27[3],mem[3],zmm27[5],mem[5],zmm27[7],mem[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload @@ -18159,476 +18133,474 @@ ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm6[1],zmm21[1],zmm6[3],zmm21[3],zmm6[5],zmm21[5],zmm6[7],zmm21[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 = zmm24[1],zmm21[1],zmm24[3],zmm21[3],zmm24[5],zmm21[5],zmm24[7],zmm21[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] ; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm4[0],zmm30[0],zmm4[2],zmm30[2],zmm4[4],zmm30[4],zmm4[6],zmm30[6] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k1} = zmm22[0],zmm20[0],zmm22[2],zmm20[2],zmm22[4],zmm20[4],zmm22[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm25, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm28[0],zmm20[0],zmm28[2],zmm20[2],zmm28[4],zmm20[4],zmm28[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm15, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm22[0],zmm23[2],zmm22[2],zmm23[4],zmm22[4],zmm23[6],zmm22[6] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm27[0],zmm26[0],zmm27[2],zmm26[2],zmm27[4],zmm26[4],zmm27[6],zmm26[6] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm29[0],zmm19[0],zmm29[2],zmm19[2],zmm29[4],zmm19[4],zmm29[6],zmm19[6] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 {%k1} = zmm5[0],mem[0],zmm5[2],mem[2],zmm5[4],mem[4],zmm5[6],mem[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm15[0],zmm6[0],zmm15[2],zmm6[2],zmm15[4],zmm6[4],zmm15[6],zmm6[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm13[1],zmm3[1],zmm13[3],zmm3[3],zmm13[5],zmm3[5],zmm13[7],zmm3[7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm28[1],zmm20[1],zmm28[3],zmm20[3],zmm28[5],zmm20[5],zmm28[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm22[1],zmm23[3],zmm22[3],zmm23[5],zmm22[5],zmm23[7],zmm22[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm26[1],zmm27[3],zmm26[3],zmm27[5],zmm26[5],zmm27[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm29[1],zmm19[1],zmm29[3],zmm19[3],zmm29[5],zmm19[5],zmm29[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm22[1],zmm12[1],zmm22[3],zmm12[3],zmm22[5],zmm12[5],zmm22[7],zmm12[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm19, %zmm12, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm17, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm15[1],zmm6[1],zmm15[3],zmm6[3],zmm15[5],zmm6[5],zmm15[7],zmm6[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm15 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm8 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm12 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm4 # 64-byte Folded Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm13 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm13, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm28, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm19, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm0[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm21, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %xmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %xmm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm21[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %xmm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm15[0],xmm17[0] ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm24, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm13[0],xmm14[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm24, %ymm24 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm24[0],xmm23[0] ; AVX512DQBW-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm11[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm19, %ymm19 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm18, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm19[0],xmm16[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %xmm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm9[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm25, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm26[0],xmm25[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 2624(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm7[0],xmm8[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm27, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm22 ; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm27[0],xmm22[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %xmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm5[0],xmm6[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm18, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm3[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm2 = xmm2[1],mem[1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm18 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm18 = xmm18[1],mem[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm18, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm21, %xmm18 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm18 = xmm21[1],mem[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm15[1],xmm17[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm15, %ymm15 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm17 = xmm30[1],xmm28[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm14[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm13, %ymm13 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm24[1],xmm23[1] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm19[1],xmm16[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm26[1],xmm25[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm7 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm27[1],xmm22[1] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm31[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rsi) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rsi) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18642,113 +18614,113 @@ ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 384(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 320(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 256(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rcx) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r8) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: addq $6664, %rsp # imm = 0x1A08 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -247,25 +247,25 @@ ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 ; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 ; SSE-NEXT: psrlw $8, %xmm1 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm4, (%rsi) -; SSE-NEXT: movdqa %xmm6, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm4, 16(%rsi) +; SSE-NEXT: movdqa %xmm6, (%rsi) ; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf32: @@ -365,28 +365,28 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind { ; SSE-LABEL: load_i8_stride2_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm11 ; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pand %xmm6, %xmm5 ; SSE-NEXT: packuswb %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm6, %xmm8 ; SSE-NEXT: packuswb %xmm10, %xmm8 ; SSE-NEXT: movdqa %xmm7, %xmm12 ; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: packuswb %xmm12, %xmm10 ; SSE-NEXT: movdqa %xmm4, %xmm12 @@ -394,72 +394,72 @@ ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm12, %xmm6 ; SSE-NEXT: psrlw $8, %xmm11 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: packuswb %xmm11, %xmm3 +; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: packuswb %xmm11, %xmm1 ; SSE-NEXT: psrlw $8, %xmm9 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm9, %xmm2 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: packuswb %xmm9, %xmm3 ; SSE-NEXT: psrlw $8, %xmm7 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: packuswb %xmm7, %xmm1 +; SSE-NEXT: psrlw $8, %xmm2 +; SSE-NEXT: packuswb %xmm7, %xmm2 ; SSE-NEXT: psrlw $8, %xmm4 ; SSE-NEXT: psrlw $8, %xmm0 ; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, 32(%rsi) -; SSE-NEXT: movdqa %xmm10, 48(%rsi) -; SSE-NEXT: movdqa %xmm8, (%rsi) -; SSE-NEXT: movdqa %xmm5, 16(%rsi) -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) +; SSE-NEXT: movdqa %xmm6, 48(%rsi) +; SSE-NEXT: movdqa %xmm10, 32(%rsi) +; SSE-NEXT: movdqa %xmm8, 16(%rsi) +; SSE-NEXT: movdqa %xmm5, (%rsi) +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) ; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride2_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vpackuswb %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm6, %xmm7 -; AVX1-ONLY-NEXT: vpackuswb %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpackuswb %xmm8, %xmm10, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm10, %xmm11 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vpackuswb %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpand %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpackuswb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -470,29 +470,29 @@ ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm7 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rsi) +; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -769,13 +769,13 @@ ; SSE-LABEL: load_i8_stride3_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 80(%rdi), %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 64(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm6 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm7 @@ -850,7 +850,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movdqa 80(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm11 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -869,10 +869,10 @@ ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm14 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -896,7 +896,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm2 +; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] ; SSE-NEXT: movdqa %xmm12, %xmm8 @@ -915,11 +915,11 @@ ; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm3, %xmm8 ; SSE-NEXT: por %xmm4, %xmm8 @@ -938,7 +938,7 @@ ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa 128(%rdi), %xmm8 +; SSE-NEXT: movdqa 176(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1225,44 +1225,44 @@ ; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) -; SSE-NEXT: movdqa %xmm6, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm13, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) -; SSE-NEXT: movdqa %xmm5, 32(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) -; SSE-NEXT: movdqa %xmm4, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movdqa %xmm6, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm13, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, 32(%rcx) +; SSE-NEXT: movdqa %xmm4, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: addq $152, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride3_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> @@ -1270,70 +1270,72 @@ ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm15, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm10 = xmm10[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 @@ -1342,17 +1344,17 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, (%rsi) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rsi) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; @@ -1366,53 +1368,53 @@ ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2 -; AVX2-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm7 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255] ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 +; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128] -; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30] +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] +; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 32(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -101,9 +101,9 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] @@ -1024,435 +1024,406 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movdqa 48(%rdi), %xmm7 -; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm4 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm8 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 -; SSE-NEXT: movdqa 96(%rdi), %xmm2 -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: subq $648, %rsp # imm = 0x288 +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm8 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm14[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm15 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm15[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm7, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: packuswb %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm15[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[0,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[0,3] +; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,7,6,5,4] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] ; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; SSE-NEXT: packuswb %xmm8, %xmm9 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm9[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: packuswb %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] @@ -1463,11 +1434,11 @@ ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: packuswb %xmm9, %xmm11 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: packuswb %xmm9, %xmm10 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] @@ -1477,65 +1448,96 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] ; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] +; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1] +; SSE-NEXT: packuswb %xmm10, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm11[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: packuswb %xmm11, %xmm12 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload ; SSE-NEXT: # xmm11 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] ; SSE-NEXT: packuswb %xmm11, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] -; SSE-NEXT: movdqa %xmm6, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movaps %xmm6, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm14, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm14, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) +; SSE-NEXT: movaps %xmm5, 32(%rcx) ; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) -; SSE-NEXT: movaps %xmm8, 32(%r8) -; SSE-NEXT: movaps %xmm7, 16(%r8) +; SSE-NEXT: movaps %xmm9, 32(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $632, %rsp # imm = 0x278 +; SSE-NEXT: addq $648, %rsp # imm = 0x288 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -104,7 +104,7 @@ ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] @@ -1527,141 +1527,142 @@ ; AVX1-ONLY-LABEL: load_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm5[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm9, %xmm11, %xmm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u],zero,zero,zero,zero,xmm3[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,0,5,10,15],zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u],zero,zero,zero,xmm3[0,5,10,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2,3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2,3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm4[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm4[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[3,8,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm13, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm5[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -2013,13 +2014,13 @@ ; SSE-LABEL: load_i8_stride5_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $568, %rsp # imm = 0x238 -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -2076,7 +2077,7 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 224(%rdi), %xmm1 +; SSE-NEXT: movdqa 64(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] @@ -2093,11 +2094,11 @@ ; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -2113,11 +2114,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2147,7 +2148,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rdi), %xmm1 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2164,11 +2165,11 @@ ; SSE-NEXT: pand %xmm14, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 272(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -2184,11 +2185,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa 240(%rdi), %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm3 @@ -2217,7 +2218,7 @@ ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa 224(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2234,11 +2235,11 @@ ; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa 272(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa 288(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm2, %xmm4 @@ -2254,11 +2255,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa 96(%rdi), %xmm0 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 240(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm15, %xmm7 @@ -2278,7 +2279,7 @@ ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa 144(%rdi), %xmm4 +; SSE-NEXT: movdqa 304(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3107,77 +3108,73 @@ ; SSE-NEXT: packuswb %xmm5, %xmm8 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 48(%rdx) +; SSE-NEXT: movaps %xmm3, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rdx) +; SSE-NEXT: movaps %xmm3, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, 32(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movaps %xmm3, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movdqa %xmm14, 16(%r8) -; SSE-NEXT: movdqa %xmm6, 48(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movaps %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm14, 48(%r8) +; SSE-NEXT: movdqa %xmm6, 32(%r8) +; SSE-NEXT: movdqa %xmm15, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm4, 48(%r9) -; SSE-NEXT: movaps %xmm7, (%r9) -; SSE-NEXT: movaps %xmm2, 32(%r9) +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm1, 48(%r9) +; SSE-NEXT: movaps %xmm4, 32(%r9) +; SSE-NEXT: movaps %xmm7, 16(%r9) +; SSE-NEXT: movaps %xmm2, (%r9) ; SSE-NEXT: addq $568, %rsp # imm = 0x238 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm10 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm5 @@ -3186,378 +3183,378 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [11,0,0,128,128,128,1,6,11,0,0,128,128,128,1,6] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,2,7,12,128,128,128,0,0,2,7,12,128,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm8, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[3,8,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[0,5,10,15,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,7,12] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [9,14,0,128,128,128,128,4,9,14,0,128,128,128,128,4] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm8, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm15, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[2,7,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,zero,xmm12[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,3,8,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <4,9,14,128,128,128,128,128,128,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[0,5,10,15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u],zero,zero,zero,xmm15[2,7,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm10, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX1-ONLY-NEXT: vpxor %xmm15, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm14, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,0,1,6,11,128,128,128,128,0,1,6,11,128,128] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u],zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] +; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u],zero,zero,zero,zero,xmm12[4,9,14,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,0,0,5,10,15,128,128,128,0,0,5,10,15,128] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3,4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[2,7,12] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,2,7,12,0,0,128,128,128,2,7,12,0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm7[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u],zero,zero,zero,xmm8[1,6,11,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[3,8,13],zero,zero,zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u],zero,zero,zero,xmm15[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3,4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,1,6,11,0,0,128,128,128,1,6,11,0,0] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,2,7,12],zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,8,13],zero,zero,zero,zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm15[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,3,8,13,0,3,8,13,0,3,8,13,0,3,8,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm8, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,1,6,11],zero,zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u],zero,zero,zero,xmm8[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm14[u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[4,9,14],zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,1,6,11,0,0,128,128,128,1,6,11,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] +; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rsi) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) -; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3844,58 +3841,57 @@ ; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7 ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm9 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm10 -; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm8 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,ymm10[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpternlogq $186, %ymm12, %ymm16, %ymm1 -; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm13 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[1,6,11] -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm14 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm9, %zmm20, %zmm17 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa %ymm5, %ymm9 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero -; AVX512F-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm19 +; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm7 +; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11] +; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512F-NEXT: vpternlogq $184, %zmm7, %zmm19, %zmm6 +; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm7 +; AVX512F-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm7, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,zero,xmm12[3,8,13],zero,zero,zero,xmm12[1,6,11] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,4,9,14],zero,zero,zero,xmm11[2,7,12],zero,zero,zero +; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm12 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm14 +; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm13 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm14 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm14[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm3 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm20 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm9, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 @@ -3904,37 +3900,37 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u] ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,7,12,17,22,27,16,21,26,31,20,25,30],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[2,7,12] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $184, %zmm2, %zmm20, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm9, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm11, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm2 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm1 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[4,9,14,u,u,u] @@ -3946,25 +3942,25 @@ ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,8,13,18,23,28,17,22,27,16,21,26,31],zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[3,8,13] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm19, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 -; AVX512F-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512F-NEXT: vpternlogq $202, %ymm9, %ymm7, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2 +; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm11, %ymm2 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -3981,24 +3977,24 @@ ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm13[4,9,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %ymm15, %ymm0, %ymm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero -; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512F-NEXT: vpternlogq $226, %ymm9, %ymm0, %ymm7 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero +; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm9 -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq $202, %ymm11, %ymm12, %ymm6 +; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm6 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -4019,7 +4015,7 @@ ; AVX512F-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rsi) ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -97,72 +97,73 @@ ; SSE-LABEL: load_i8_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: andnps %xmm0, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movaps {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: andnps %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,3] +; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm7, %xmm0 +; SSE-NEXT: andnps %xmm3, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm3[0,3] +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: andps %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm4 +; SSE-NEXT: andps %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,0] +; SSE-NEXT: orps %xmm0, %xmm7 +; SSE-NEXT: andps %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movd %xmm1, (%rsi) -; SSE-NEXT: movd %xmm5, (%rdx) -; SSE-NEXT: movd %xmm6, (%rcx) -; SSE-NEXT: movd %xmm7, (%r8) +; SSE-NEXT: movss %xmm4, (%rdx) +; SSE-NEXT: movd %xmm5, (%rcx) +; SSE-NEXT: movd %xmm6, (%r8) ; SSE-NEXT: movd %xmm0, (%r9) ; SSE-NEXT: movd %xmm2, (%rax) ; SSE-NEXT: retq @@ -866,18 +867,18 @@ ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10] -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10] +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> @@ -886,34 +887,34 @@ ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,6,12] ; AVX2-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,7,13] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[2,8,14] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[3,9,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3,4],xmm1[5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsi) ; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rdx) ; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rcx) @@ -2360,11 +2361,11 @@ ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm9 +; SSE-NEXT: movdqa 144(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2383,11 +2384,11 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm5, %xmm10 @@ -2403,11 +2404,11 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 368(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -2484,11 +2485,11 @@ ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 336(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm7, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2518,10 +2519,10 @@ ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: movdqa 304(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm10, %xmm9 ; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa 352(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm1 @@ -2563,11 +2564,11 @@ ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: movdqa 368(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm3 @@ -3548,47 +3549,47 @@ ; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: por %xmm9, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rsi) +; SSE-NEXT: movaps %xmm0, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm0, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: movdqa %xmm7, 16(%r9) +; SSE-NEXT: movdqa %xmm7, 48(%r9) ; SSE-NEXT: movdqa %xmm6, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 16(%rax) +; SSE-NEXT: movdqa %xmm2, 48(%rax) ; SSE-NEXT: movdqa %xmm13, 32(%rax) -; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 16(%rax) ; SSE-NEXT: movdqa %xmm8, (%rax) ; SSE-NEXT: addq $824, %rsp # imm = 0x338 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -1501,20 +1501,20 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero -; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -1601,90 +1601,92 @@ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm7[2,9] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7] -; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0],xmm10[1],xmm9[2],xmm10[3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm5[2,9] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6 -; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm9 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10] +; AVX512F-NEXT: vmovdqa %ymm8, %ymm4 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm5, %xmm4, %xmm11 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero -; AVX512F-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero -; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512F-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vmovdqa %ymm10, %ymm12 +; AVX512F-NEXT: vpternlogq $184, %xmm11, %xmm7, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm11, %ymm12 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero +; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm13 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vmovdqa %ymm12, %ymm14 +; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512F-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero +; AVX512F-NEXT: vpor %xmm15, %xmm10, %xmm10 +; AVX512F-NEXT: vpternlogq $184, %xmm14, %xmm7, %xmm10 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8 -; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm14 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero -; AVX512F-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9 -; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512F-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX512F-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm9, %xmm9 ; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9 -; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm11 +; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6],xmm5[7] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512F-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX512F-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm11 +; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6],xmm5[7] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0 -; AVX512F-NEXT: vmovdqa %xmm5, (%rsi) -; AVX512F-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512F-NEXT: vmovdqa %xmm11, (%rcx) -; AVX512F-NEXT: vmovdqa %xmm14, (%r8) -; AVX512F-NEXT: vmovdqa %xmm12, (%r9) -; AVX512F-NEXT: vmovdqa %xmm9, (%r10) +; AVX512F-NEXT: vmovdqa %xmm3, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512F-NEXT: vmovdqa %xmm13, (%rcx) +; AVX512F-NEXT: vmovdqa %xmm10, (%r8) +; AVX512F-NEXT: vmovdqa %xmm9, (%r9) +; AVX512F-NEXT: vmovdqa %xmm11, (%r10) ; AVX512F-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1693,97 +1695,99 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm8[1],xmm7[2],xmm8[3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7] -; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-NEXT: kmovd %edi, %k2 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm0[1],xmm1[2,3,4],xmm0[5],xmm1[6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm3 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7] +; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7] +; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: movw $4644, %r11w # imm = 0x1224 +; AVX512BW-NEXT: kmovd %r11d, %k2 +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm5[1],xmm6[2,3,4],xmm5[5],xmm6[6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm9[3,10] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00 ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} +; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm4 {%k1} ; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244 ; AVX512BW-NEXT: kmovd %edi, %k3 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k3} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero +; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm9 {%k1} ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448 ; AVX512BW-NEXT: kmovd %edi, %k4 -; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero +; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2} -; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero +; AVX512BW-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm10 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm8 {%k2} +; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1} -; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k3} -; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero +; AVX512BW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm8 {%k1} +; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm7 {%k3} +; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm11 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6],xmm0[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm6[1,2],xmm5[3],xmm6[4,5,6],xmm5[7] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1} -; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero +; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm7 {%k1} +; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k4} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) -; AVX512BW-NEXT: vmovdqa %xmm10, (%r9) -; AVX512BW-NEXT: vmovdqa %xmm9, (%r10) -; AVX512BW-NEXT: vmovdqa %xmm2, (%rax) +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm4, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm9, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm10, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm8, (%r9) +; AVX512BW-NEXT: vmovdqa %xmm7, (%r10) +; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %wide.vec = load <112 x i8>, ptr %in.vec, align 64 @@ -1808,16 +1812,16 @@ ; SSE-LABEL: load_i8_stride7_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $632, %rsp # imm = 0x278 -; SSE-NEXT: movdqa 208(%rdi), %xmm9 -; SSE-NEXT: movdqa 192(%rdi), %xmm6 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 80(%rdi), %xmm6 +; SSE-NEXT: movdqa 64(%rdi), %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -1900,11 +1904,11 @@ ; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 144(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -1918,11 +1922,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 128(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 @@ -1941,11 +1945,11 @@ ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 192(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm5, %xmm2 @@ -1958,7 +1962,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2843,29 +2847,29 @@ ; SSE-NEXT: por %xmm11, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r8) -; SSE-NEXT: movdqa %xmm4, (%r9) -; SSE-NEXT: movdqa %xmm3, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm3, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, (%rax) -; SSE-NEXT: movdqa %xmm6, 16(%rax) +; SSE-NEXT: movdqa %xmm5, 16(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm8, (%rax) ; SSE-NEXT: addq $632, %rsp # imm = 0x278 ; SSE-NEXT: retq ; @@ -4239,186 +4243,194 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 ; AVX512BW-SLOW-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k5 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: movw $992, %r11w # imm = 0x3E0 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: movw $9288, %r11w # imm = 0x2448 ; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm4, %ymm3, %ymm5 {%k1} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: movw $992, %r11w # imm = 0x3E0 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: movw $8772, %r11w # imm = 0x2244 -; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] ; AVX512BW-SLOW-NEXT: movl $-524288, %edi # imm = 0xFFF80000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm0 {%k4} ; AVX512BW-SLOW-NEXT: movw $4644, %di # imm = 0x1224 -; AVX512BW-SLOW-NEXT: kmovd %edi, %k2 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-SLOW-NEXT: kmovd %edi, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3} -; AVX512BW-SLOW-NEXT: movw $9288, %di # imm = 0x2448 -; AVX512BW-SLOW-NEXT: kmovd %edi, %k3 -; AVX512BW-SLOW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm9 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm11, %xmm9, %xmm11 +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF +; AVX512BW-SLOW-NEXT: kmovd %edi, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k6} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm5, %ymm11 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4} +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX512BW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k5} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-SLOW-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k1} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm12 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k2} ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-SLOW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm10, (%rdx) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm12, (%rcx) +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14] +; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k2} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3} +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15] +; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k5} +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm9, (%rdx) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm10, (%rcx) ; AVX512BW-SLOW-NEXT: vmovdqa %ymm11, (%r8) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm4, (%r10) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm12, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm13, (%r10) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -4426,185 +4438,186 @@ ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: movw $-28382, %r11w # imm = 0x9122 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k5 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm2 {%k5} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: movw $992, %r11w # imm = 0x3E0 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm5, %ymm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-FAST-NEXT: movw $8772, %r11w # imm = 0x2244 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k1} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6] +; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX512BW-FAST-NEXT: movl $-524288, %r11d # imm = 0xFFF80000 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm2 {%k4} ; AVX512BW-FAST-NEXT: movw $4644, %r11w # imm = 0x1224 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k2 -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm7 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm7 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: movl $511, %r11d # imm = 0x1FF ; AVX512BW-FAST-NEXT: kmovd %r11d, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3} ; AVX512BW-FAST-NEXT: movw $9288, %r11w # imm = 0x2448 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k3 -; AVX512BW-FAST-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm5, %ymm8 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] +; AVX512BW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4} +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm8 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm10 ; AVX512BW-FAST-NEXT: movl $261632, %r11d # imm = 0x3FE00 ; AVX512BW-FAST-NEXT: kmovd %r11d, %k4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm10 {%k5} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX512BW-FAST-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6] -; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] -; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm12 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,3,5,6,1,3,5,6] +; AVX512BW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm11, %ymm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm9 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[2,9,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10 +; AVX512BW-FAST-NEXT: vmovdqa 208(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-FAST-NEXT: kmovd %edi, %k5 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm12, %ymm10 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FAST-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3} +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm13, %ymm12 {%k5} +; AVX512BW-FAST-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm1, %ymm0, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FAST-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k3} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u] ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] ; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5} -; AVX512BW-FAST-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1} +; AVX512BW-FAST-NEXT: vpblendmw %ymm4, %ymm3, %ymm14 {%k1} ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2} -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm15[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm5 {%k2} +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] +; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15] -; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5} -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] -; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%rdx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512BW-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX512BW-FAST-NEXT: vmovdqa %ymm5, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm4, (%r10) +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm6, %ymm5 {%k5} +; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm3 {%k3} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512BW-FAST-NEXT: vpermi2w %ymm0, %ymm1, %ymm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1,2,3,4,5,6,7],ymm3[8],ymm5[9,10,11,12,13,14,15] +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BW-FAST-NEXT: vmovdqa %ymm7, (%rdx) +; AVX512BW-FAST-NEXT: vmovdqa %ymm8, (%rcx) +; AVX512BW-FAST-NEXT: vmovdqa %ymm10, (%r8) +; AVX512BW-FAST-NEXT: vmovdqa %ymm12, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa %ymm13, (%r10) ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -4630,18 +4643,18 @@ ; SSE-LABEL: load_i8_stride7_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $1512, %rsp # imm = 0x5E8 -; SSE-NEXT: movdqa 208(%rdi), %xmm10 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rdi), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -4721,11 +4734,11 @@ ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 272(%rdi), %xmm2 +; SSE-NEXT: movdqa 160(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill ; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4741,11 +4754,11 @@ ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 224(%rdi), %xmm2 +; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4763,11 +4776,11 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa 288(%rdi), %xmm3 +; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 @@ -4778,7 +4791,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: movdqa 320(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4875,11 +4888,11 @@ ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 256(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm8, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -4893,11 +4906,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa 224(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 @@ -4916,11 +4929,11 @@ ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm11 ; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm2 +; SSE-NEXT: movdqa 288(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 80(%rdi), %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 @@ -4931,7 +4944,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa 96(%rdi), %xmm2 +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6647,79 +6660,79 @@ ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm2, 32(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rdx) +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps %xmm2, (%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm2, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%r9) +; SSE-NEXT: movaps %xmm2, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 48(%r9) +; SSE-NEXT: movaps %xmm2, (%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm6, 32(%rax) ; SSE-NEXT: movdqa %xmm12, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movdqa %xmm7, 32(%rax) ; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 16(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm0, (%rax) ; SSE-NEXT: addq $1512, %rsp # imm = 0x5E8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: subq $728, %rsp # imm = 0x2D8 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm7 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128] @@ -6748,69 +6761,68 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1] +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm3, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6819,106 +6831,104 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm3, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm10, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm3, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm3, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm13, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 +; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm6, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm1, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm11 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm10 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm10, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] @@ -6926,377 +6936,434 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm12 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm12 +; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm11, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm15, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm10 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm11 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpblendvb %xmm2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm10, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm2, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128] +; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm4[1,2],xmm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0],xmm5[1,2],xmm6[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm9, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm9, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15] +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] ; AVX1-ONLY-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpxor %xmm11, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[3,10] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm11[3,10] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm12 +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm13 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm14 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX1-ONLY-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = mem[0,1,2,3,4,5],xmm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX1-ONLY-NEXT: vpxor %xmm14, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm14[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm10, %ymm15, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm15, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm2[u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[6,13] +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm1, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm13, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm13, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,5,6],mem[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm10[6,13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload @@ -7304,60 +7371,60 @@ ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6] +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,u,u,u,u],zero,zero,xmm3[2,9,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm10[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,u,u],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,1,8,15],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u],zero,zero,xmm5[2,9,u,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm15[6,13] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 @@ -7366,121 +7433,65 @@ ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,xmm6[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm1, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm13 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,xmm13[2,9,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm14[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm11[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm14, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 @@ -7512,7 +7523,7 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX1-ONLY-NEXT: addq $728, %rsp # imm = 0x2D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -10744,416 +10755,408 @@ ; ; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512BW-ONLY-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k2 -; AVX512BW-ONLY-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm0 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm0, %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm2, %ymm0 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm9, %ymm6, %ymm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm18 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm8, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm11, %xmm0, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm11 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7,8,9],ymm12[10],ymm11[11,12,13],ymm12[14],ymm11[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm11, %ymm20 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm11 ; AVX512BW-ONLY-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21 ; AVX512BW-ONLY-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm12 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm12, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm12 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4,5],ymm3[6],ymm12[7,8,9],ymm3[10],ymm12[11,12,13],ymm3[14],ymm12[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $511, %edi # imm = 0x1FF ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm19, %ymm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm9, %ymm6, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm18, %xmm10, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm18, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm4, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %edi, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm5, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6],ymm5[7,8],ymm4[9,10],ymm5[11],ymm4[12,13,14],ymm5[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm4 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovq %k1, %k7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm0 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm3 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm3 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm15, %ymm11 {%k7} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm14 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm7, %ymm2, %ymm13 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm22, %ymm1, %ymm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm7, %ymm2 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm22, %ymm1, %ymm5 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm7 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm22, %ymm1 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %ymm9, %ymm6 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[0,7,14] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm7 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4],ymm14[5,6],ymm8[7,8],ymm14[9,10,11],ymm8[12],ymm14[13,14],ymm8[15] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm25, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm24, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: load_i8_stride7_vf64: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 96(%rdi), %ymm25 ; AVX512BW-ONLY-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm4, %xmm0, %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm25, %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm4, %ymm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512BW-ONLY-FAST-NEXT: movw $8772, %ax # imm = 0x2244 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k3 -; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm6, %ymm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovq %k1, %k6 ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] @@ -11161,734 +11164,740 @@ ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] ; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm16, %ymm5, %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm12, %xmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm9, %zmm23 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: movw $9288, %ax # imm = 0x2448 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k7 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm12 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm9 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm10, %xmm9, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm9, %ymm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 ; AVX512BW-ONLY-FAST-NEXT: movw $4644, %ax # imm = 0x1224 -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k6 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm10, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm10, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm0, %ymm21 ; AVX512BW-ONLY-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm10 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm10, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm10, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm25, %ymm24, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm17, %ymm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm6, %ymm17 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,3,4,6,1,3,4,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm16, %ymm17, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm1, %xmm17, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm10 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm25, %ymm24, %ymm1 ; AVX512BW-ONLY-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k5 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,3,5,6,1,3,5,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm16, %ymm17, %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm2, %xmm16, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm25, %ymm24, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 208(%rdi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm19[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm2, %xmm17, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-ONLY-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 ; AVX512BW-ONLY-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm6[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm2, %ymm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm2, %xmm17, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm1 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm1 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] +; AVX512BW-ONLY-FAST-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm14 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm0, %ymm20 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,u,u,u,7,24,17,10,u,28,21,14,31> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k1} ; AVX512BW-ONLY-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm0 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm0 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm21 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm9, %ymm3 {%k7} -; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k6} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm22, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm21, %ymm11 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm8, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm8, %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm19, %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm19, %ymm8 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm1, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm9, %zmm0 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm11, %zmm9 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm3, %ymm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,4,u,u,7,u,25,18,11,28,21,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm11, %xmm11 +; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %eax # imm = 0x3FE000 +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm13, %ymm9 {%k6} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm7, %ymm3, %ymm1 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm2 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm7, %ymm3 {%k7} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm8, %ymm6 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[6,13] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm19[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm7, %ymm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm19[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm7, %ymm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm19[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm25, %ymm24, %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512BW-ONLY-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm22[5,12],zero,zero,xmm22[1,8,15],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm25, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm2, %zmm8 {%k5} +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm3, %xmm11, %xmm3 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-ONLY-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-ONLY-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %ymm24, %ymm25, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu16 %zmm3, %zmm2 {%k5} ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512BW-ONLY-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm1, %zmm4, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero +; AVX512BW-ONLY-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rdi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512DQBW-SLOW-NEXT: movw $-28382, %ax # imm = 0x9122 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm1 {%k1} -; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k2 -; AVX512DQBW-SLOW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm1, %xmm25 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm0, %xmm16 +; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm2, %ymm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQBW-SLOW-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k6 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm3 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm6, %ymm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm18 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm8, %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm24 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 224(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm25 {%k5} -; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512DQBW-SLOW-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k3 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm10 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm14, %xmm10, %xmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm10 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm10[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7,8,9],ymm14[10],ymm10[11,12,13],ymm14[14],ymm10[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm11, %xmm0, %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 352(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm11 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7,8,9],ymm12[10],ymm11[11,12,13],ymm12[14],ymm11[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm10, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm20 {%k7} +; AVX512DQBW-SLOW-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm11 ; AVX512DQBW-SLOW-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k4 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm20 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm12 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm12[4,11],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21 ; AVX512DQBW-SLOW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm22 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm12 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm12, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm12, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm12 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4,5],ymm3[6],ymm12[7,8,9],ymm3[10],ymm12[11,12,13],ymm3[14],ymm12[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $511, %edi # imm = 0x1FF -; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm22, %ymm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm8, %ymm5, %ymm22 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm23, %xmm22, %xmm22 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k6 +; AVX512DQBW-SLOW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm19, %ymm12 {%k6} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm9, %ymm6, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm19, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm4, %xmm4 ; AVX512DQBW-SLOW-NEXT: movl $261632, %edi # imm = 0x3FE00 -; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm1, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm17, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm21[0],xmm17[0],xmm21[1],xmm17[1],xmm21[2],xmm17[2],xmm21[3],xmm17[3] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm17, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm6, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k6 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k6} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm3, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm18, %xmm10, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm18 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm18, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm4, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm1, %ymm22, %ymm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k6} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512DQBW-SLOW-NEXT: movl $-134217728, %edi # imm = 0xF8000000 -; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k2 -; AVX512DQBW-SLOW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm18 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm3, %xmm18, %xmm3 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 +; AVX512DQBW-SLOW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm5, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k2} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3 ; AVX512DQBW-SLOW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 ; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5,6],ymm3[7,8],ymm1[9,10],ymm3[11],ymm1[12,13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm10, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm12, %ymm13, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm1 {%k6} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6],ymm3[7,8],ymm1[9,10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5,6],ymm5[7,8],ymm4[9,10],ymm5[11],ymm4[12,13,14],ymm5[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm11, %ymm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] -; AVX512DQBW-SLOW-NEXT: vporq %xmm1, %xmm0, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm15, %ymm16, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm0, %ymm19 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm16, %ymm15, %ymm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u] +; AVX512DQBW-SLOW-NEXT: vporq %xmm4, %xmm3, %xmm20 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k4} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm17, %ymm0, %ymm4 {%k1} ; AVX512DQBW-SLOW-NEXT: kmovq %k1, %k7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm20 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k6} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm19 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm10, %ymm14, %ymm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm22 {%k6} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm21 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm16, %ymm15 {%k4} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm11, %ymm15, %ymm3 {%k5} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512DQBW-SLOW-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm14, %ymm10 {%k7} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm2, %ymm0 {%k4} -; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm5, %ymm8, %ymm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm2 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm8, %ymm5 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm22, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm15, %ymm11 {%k7} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm2, %ymm7, %ymm14 {%k5} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm3 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm7, %ymm2, %ymm13 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm22, %ymm1, %ymm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm7, %ymm2 {%k3} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm22, %ymm1, %ymm5 {%k4} +; AVX512DQBW-SLOW-NEXT: vpblendmw %ymm6, %ymm9, %ymm7 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm22, %ymm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %ymm9, %ymm6 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[6,13] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQBW-SLOW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[0,7,14] +; AVX512DQBW-SLOW-NEXT: vpor %xmm9, %xmm15, %xmm9 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> -; AVX512DQBW-SLOW-NEXT: vpermw %zmm24, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm10[1,8,15] +; AVX512DQBW-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4],ymm14[5,6],ymm8[7,8],ymm14[9,10,11],ymm8[12],ymm14[13,14],ymm8[15] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm9 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 {%k6} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm25, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm5 {%k6} ; AVX512DQBW-SLOW-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-SLOW-NEXT: vextracti32x4 $1, %ymm21, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm9, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k5} -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm6[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm26, %xmm2 -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7,8],ymm3[9],ymm13[10,11],ymm3[12],ymm13[13,14,15] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm9, %xmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k6} = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k6} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k6} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm24, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k6} ; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm21, %zmm0, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm2 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15] -; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero +; AVX512DQBW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7],ymm12[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, (%rsi) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rdx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, (%rcx) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, (%rdi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rdi) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: load_i8_stride7_vf64: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 96(%rdi), %ymm26 ; AVX512DQBW-FAST-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k1} ; AVX512DQBW-FAST-NEXT: kmovq %k1, %k2 ; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm4, %xmm0, %xmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm26, %ymm4 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm4, %ymm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 160(%rdi), %ymm6 ; AVX512DQBW-FAST-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: kmovq %k1, %k3 -; AVX512DQBW-FAST-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm6, %ymm4 {%k7} ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] @@ -11896,317 +11905,331 @@ ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] ; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm16 +; AVX512DQBW-FAST-NEXT: vpermd %ymm16, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm7 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQBW-FAST-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512DQBW-FAST-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQBW-FAST-NEXT: kmovq %rax, %k5 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm6 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm9, %zmm24 {%k5} +; AVX512DQBW-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm11 ; AVX512DQBW-FAST-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm12 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm12[u,u,u,u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm12, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm9 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm10, %xmm9, %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQBW-FAST-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm9 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movw $3968, %ax # imm = 0xF80 -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k7 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm10, %ymm21 {%k7} -; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k6 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm9, %ymm20 {%k6} +; AVX512DQBW-FAST-NEXT: vmovdqa 416(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 ; AVX512DQBW-FAST-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k4 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm20 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm22 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm22, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm10 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm10, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm10, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm0, %ymm21 ; AVX512DQBW-FAST-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm20 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm23 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm10 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm10, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm10, %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,17,10,3,u,21,14,7,24,u,u,u,28,u,u,31> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm26, %ymm25, %ymm10 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm20, %ymm13 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm8, %ymm20 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm7[1,8,15],zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm20, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm14, %zmm13 {%k5} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm6, %ymm17 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [1,3,4,6,1,3,4,6] +; AVX512DQBW-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermd %ymm16, %ymm17, %ymm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm1, %xmm17, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm10 {%k5} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,18,11,4,21,14,7,u,25,u,u,28,u,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm26, %ymm25, %ymm1 ; AVX512DQBW-FAST-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k5 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm15 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm15, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm0, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm14, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm9, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm1 {%k2} +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm1, %xmm17 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [1,3,5,6,1,3,5,6] +; AVX512DQBW-FAST-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermd %ymm16, %ymm17, %ymm16 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm16, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm3, %ymm7, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm18 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,1,18,11,4,u,22,15,u,25,u,u,u,29,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm26, %ymm25, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm1 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,1,8,15],zero,zero,xmm1[4,11],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 208(%rdi), %xmm18 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm19[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm17, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQBW-FAST-NEXT: movl $-134217728, %edi # imm = 0xF8000000 ; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 ; AVX512DQBW-FAST-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,xmm5[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm18, %xmm23, %xmm18 -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm18, %zmm14, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm2, %ymm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm2, %xmm17, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512DQBW-FAST-NEXT: kmovq %rax, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u] +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm1 {%k7} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm13 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm10, %ymm14 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm14[6,13],zero,zero -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11] -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm13, %ymm9, %ymm1 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm4, %ymm6, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm11, %ymm12, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm14, %ymm0 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm14 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm16 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm16, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm14, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm1, %ymm0 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm1 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} ; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm14, %zmm16 -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k2} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm14, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm14, %xmm14 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[6,13],zero,zero,xmm17[2,9,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm16, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u] +; AVX512DQBW-FAST-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm15, %ymm14, %ymm0 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm0, %ymm20 {%k6} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm12, %ymm11, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,u,u,u,7,24,17,10,u,28,21,14,31> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm17, %ymm16 {%k1} -; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm0, %xmm17 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm0 {%k7} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13] ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm14 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm10, %ymm12, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm20 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm13, %ymm0 {%k3} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm17, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm21 {%k4} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm9, %ymm3, %ymm17 {%k1} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm0 {%k6} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm9, %ymm3 {%k6} -; AVX512DQBW-FAST-NEXT: vpblendmw %ymm8, %ymm11, %ymm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm11, %ymm8 {%k4} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm9 {%k3} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm22, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm0 {%k3} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm11, %xmm8, %xmm8 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15] -; AVX512DQBW-FAST-NEXT: vporq %xmm11, %xmm19, %xmm11 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm11, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQBW-FAST-NEXT: vpermw %zmm1, %zmm20, %zmm1 -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm21, %xmm20 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm9, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm9, %zmm20 {%k5} -; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm5[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm19 = xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm0, %zmm9 {%k5} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm0, %ymm21 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm23 {%k7} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm3, %ymm22 {%k7} +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,u,u,7,u,25,18,11,28,21,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm14, %ymm15, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm11 {%k7} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm11, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm11 +; AVX512DQBW-FAST-NEXT: movl $4186112, %eax # imm = 0x3FE000 +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm11 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm13, %ymm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm7, %ymm3, %ymm0 {%k4} +; AVX512DQBW-FAST-NEXT: vpblendmw %ymm6, %ymm8, %ymm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm7, %ymm3 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm8, %ymm6 {%k4} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm23[u,u,2,9],zero,zero,zero,xmm23[5,12],zero,zero,xmm23[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm23, %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm18[6,13] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm19[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm7, %ymm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm1, %xmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm19[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm7, %ymm1 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm19[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm18[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm18[1,8,15] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,1,u,19,12,5,22,15,u,u,26,u,u,29,u,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm26, %ymm25, %ymm7 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm2, %zmm0 {%k5} +; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQBW-FAST-NEXT: vextracti32x4 $1, %ymm22, %xmm2 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm22[5,12],zero,zero,xmm22[1,8,15],zero,zero,xmm22[u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,25,18,3,28,21,u,7,u,u,10,u,u,u,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm26, %ymm8 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm2 {%k5} +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm1, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm12, %ymm10 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQBW-FAST-NEXT: vpermw %zmm26, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQBW-FAST-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1} -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512DQBW-FAST-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm3 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,25,18,u,4,29,22,7,u,u,u,11,u,u,14,u> +; AVX512DQBW-FAST-NEXT: vpermi2w %ymm25, %ymm26, %ymm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5} ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15] -; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u,u,u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero -; AVX512DQBW-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQBW-FAST-NEXT: vpermi2w %zmm1, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm21, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15] +; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero +; AVX512DQBW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7],ymm11[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm1 {%k1} ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rsi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, (%rdx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, (%rcx) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%r8) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, (%r9) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -1884,439 +1884,434 @@ ; SSE-LABEL: load_i8_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa 192(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 224(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa 144(%rdi), %xmm13 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa 64(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm7 -; SSE-NEXT: movdqa 224(%rdi), %xmm8 -; SSE-NEXT: movdqa 240(%rdi), %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] +; SSE-NEXT: movdqa 96(%rdi), %xmm10 +; SSE-NEXT: movdqa 112(%rdi), %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa 176(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa 128(%rdi), %xmm12 +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: packuswb %xmm0, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: packuswb %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm14 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: packuswb %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: packuswb %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,1,1] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: por %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: packuswb %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3],xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,1,3] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: packuswb %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[1,1,1,1] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1] -; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm7, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm15, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm15, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm10, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1] -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1] +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2324,36 +2319,37 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm11, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2361,40 +2357,43 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm7, %xmm1 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm4, %xmm4 ; SSE-NEXT: pand %xmm11, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 @@ -2405,14 +2404,14 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2430,26 +2429,26 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm5 ; SSE-NEXT: pandn %xmm4, %xmm5 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 @@ -2466,15 +2465,15 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -2482,43 +2481,43 @@ ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,1,3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,2,0,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -2528,16 +2527,16 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -2546,8 +2545,8 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -2555,7 +2554,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -2563,15 +2562,14 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] +; SSE-NEXT: packuswb %xmm3, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm12, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm0[0],xmm8[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2595,25 +2593,25 @@ ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: packuswb %xmm1, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: packuswb %xmm1, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] @@ -2621,7 +2619,7 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -2630,200 +2628,201 @@ ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,1,1] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: pshuflw $116, (%rsp), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,1,1] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm7, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1] ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 ; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE-NEXT: pshuflw $116, (%rsp), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[2,2,3,3] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1] ; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,1,2,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: por %xmm2, %xmm13 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: pandn %xmm15, %xmm11 ; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,3,3] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm1, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,3,3,3] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm13[0],xmm0[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movapd %xmm7, (%r9) -; SSE-NEXT: movapd %xmm6, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%r8) +; SSE-NEXT: movapd %xmm9, 16(%r9) +; SSE-NEXT: movapd %xmm4, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm4, (%rax) -; SSE-NEXT: movapd %xmm8, 16(%rax) +; SSE-NEXT: movapd %xmm7, 16(%rax) +; SSE-NEXT: movapd %xmm8, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm12, (%rax) -; SSE-NEXT: movapd %xmm15, 16(%rax) +; SSE-NEXT: movapd %xmm14, 16(%rax) +; SSE-NEXT: movapd %xmm1, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm0, 16(%rax) -; SSE-NEXT: movapd %xmm5, (%rax) +; SSE-NEXT: movapd %xmm12, (%rax) ; SSE-NEXT: addq $904, %rsp # imm = 0x388 ; SSE-NEXT: retq ; @@ -4457,136 +4456,133 @@ ; AVX512F-SLOW-LABEL: load_i8_stride8_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-SLOW-NEXT: vpmovqb %ymm9, %xmm9 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm12 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4594,43 +4590,38 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5],ymm14[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 @@ -4638,41 +4629,41 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 @@ -4680,45 +4671,44 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -4727,42 +4717,36 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 @@ -4770,44 +4754,42 @@ ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 ; AVX512F-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, (%r9) @@ -5057,291 +5039,293 @@ ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm3, %xmm3 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] +; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm9, %xmm9 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpmovqb %zmm16, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm13 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 +; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm10 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm10 -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm19, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm14 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm12 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm20, (%rsi) ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm21, (%rdx) ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm22, (%rcx) @@ -5589,21 +5573,21 @@ ; SSE-LABEL: load_i8_stride8_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $2024, %rsp # imm = 0x7E8 -; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 192(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm8 +; SSE-NEXT: movdqa 208(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa 224(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm5 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa 176(%rdi), %xmm13 -; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 48(%rdi), %xmm13 +; SSE-NEXT: movdqa 64(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm15 -; SSE-NEXT: movdqa 224(%rdi), %xmm9 -; SSE-NEXT: movdqa 240(%rdi), %xmm12 +; SSE-NEXT: movdqa 80(%rdi), %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm9 +; SSE-NEXT: movdqa 112(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -5627,7 +5611,7 @@ ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm3, %xmm3 @@ -5647,17 +5631,17 @@ ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm14 +; SSE-NEXT: movdqa 176(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 +; SSE-NEXT: movdqa 144(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 128(%rdi), %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 @@ -6134,11 +6118,13 @@ ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: packuswb %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[1,1,1,1] ; SSE-NEXT: packuswb %xmm15, %xmm15 ; SSE-NEXT: pand %xmm9, %xmm15 ; SSE-NEXT: por %xmm5, %xmm15 @@ -6155,10 +6141,10 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3] +; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 @@ -6565,10 +6551,11 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,3,3] +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 @@ -6581,10 +6568,8 @@ ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,0,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -6636,9 +6621,10 @@ ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 @@ -6911,11 +6897,10 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -6944,9 +6929,10 @@ ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: packuswb %xmm2, %xmm8 +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 @@ -7019,9 +7005,10 @@ ; SSE-NEXT: packuswb %xmm3, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] @@ -7063,11 +7050,10 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,1,3] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 @@ -7088,12 +7074,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -7124,14 +7109,16 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 @@ -7167,8 +7154,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 @@ -7212,7 +7198,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, (%rsp), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -7257,7 +7243,7 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshuflw $231, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -7290,7 +7276,8 @@ ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,3,3] +; SSE-NEXT: pshufd $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,3,3] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7312,8 +7299,7 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm13, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 @@ -7358,8 +7344,7 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] ; SSE-NEXT: packuswb %xmm8, %xmm8 ; SSE-NEXT: pand %xmm12, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 @@ -7412,51 +7397,52 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r8) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movapd %xmm15, 32(%rax) -; SSE-NEXT: movapd %xmm6, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm4, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm5, 48(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) +; SSE-NEXT: movapd %xmm5, 32(%rax) +; SSE-NEXT: movapd %xmm7, 48(%rax) ; SSE-NEXT: movapd %xmm11, 16(%rax) ; SSE-NEXT: movapd %xmm14, (%rax) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -7469,71 +7455,70 @@ ; ; AVX1-ONLY-LABEL: load_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: subq $840, %rsp # imm = 0x348 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7541,65 +7526,66 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 496(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 480(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 448(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 416(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 384(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7608,44 +7594,45 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm15 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -7673,13 +7660,12 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -7712,39 +7698,40 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7752,8 +7739,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -7770,15 +7757,14 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -7803,16 +7789,15 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] @@ -7822,29 +7807,30 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7852,7 +7838,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -7863,20 +7849,19 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 @@ -7901,28 +7886,28 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -7931,21 +7916,20 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -7964,23 +7948,25 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8008,19 +7994,20 @@ ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8031,8 +8018,7 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -8040,7 +8026,7 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm15 @@ -8063,32 +8049,32 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -8099,13 +8085,13 @@ ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] @@ -8118,8 +8104,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] @@ -8130,19 +8116,19 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 @@ -8168,40 +8154,43 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7] @@ -8212,33 +8201,33 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -8247,43 +8236,41 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -8291,110 +8278,112 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9) +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: addq $808, %rsp # imm = 0x328 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) +; AVX1-ONLY-NEXT: addq $840, %rsp # imm = 0x348 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: load_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm13 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -8402,118 +8391,118 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8527,19 +8516,19 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] @@ -8548,28 +8537,28 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] @@ -8579,42 +8568,42 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8622,8 +8611,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -8649,7 +8638,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -8674,28 +8663,28 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -8712,11 +8701,12 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8724,26 +8714,25 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -8751,7 +8740,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -8776,7 +8765,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -8790,13 +8780,12 @@ ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -8818,7 +8807,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm15 @@ -8829,41 +8818,40 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -8879,8 +8867,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -8892,15 +8880,16 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -8911,21 +8900,21 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -8940,33 +8929,34 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 @@ -8980,29 +8970,27 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -9013,21 +9001,21 @@ ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -9048,21 +9036,22 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -9070,9 +9059,9 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 @@ -9086,11 +9075,10 @@ ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -9098,39 +9086,38 @@ ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -9139,31 +9126,32 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm13 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm12 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 @@ -9184,38 +9172,38 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) +; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -9223,44 +9211,44 @@ ; AVX2-FAST-LABEL: load_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388 -; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] @@ -9271,86 +9259,87 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 @@ -9359,49 +9348,48 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -9421,107 +9409,107 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 @@ -9530,32 +9518,32 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] @@ -9565,26 +9553,25 @@ ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2,3] @@ -9608,25 +9595,26 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1],xmm5[2,3] @@ -9650,59 +9638,59 @@ ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] @@ -9726,61 +9714,64 @@ ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,22,26,30,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] @@ -9807,25 +9798,24 @@ ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] @@ -9834,65 +9824,68 @@ ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%r9) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%r9) +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: addq $904, %rsp # imm = 0x388 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -9900,72 +9893,73 @@ ; AVX2-FAST-PERLANE-LABEL: load_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -9973,118 +9967,118 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 240(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 464(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10098,19 +10092,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] @@ -10119,28 +10113,28 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] @@ -10150,42 +10144,42 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10193,8 +10187,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -10220,7 +10214,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -10245,28 +10239,28 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -10283,11 +10277,12 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10295,26 +10290,25 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -10322,7 +10316,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -10347,7 +10341,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -10361,13 +10356,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -10389,7 +10383,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm15 @@ -10400,41 +10394,40 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -10450,8 +10443,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -10463,15 +10456,16 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -10482,21 +10476,21 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10511,33 +10505,34 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 @@ -10551,29 +10546,27 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] @@ -10584,21 +10577,21 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -10619,21 +10612,22 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 @@ -10641,9 +10635,9 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm3 @@ -10657,11 +10651,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -10669,39 +10662,38 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 @@ -10710,31 +10702,32 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 @@ -10755,769 +10748,775 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $840, %rsp # imm = 0x348 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpmovqb %ymm4, %xmm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpmovqb %ymm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm27, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm16 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm19 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm28 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vpmovqb %ymm5, %xmm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm31 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-SLOW-NEXT: vpmovqb %zmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm29 +; AVX512F-SLOW-NEXT: vpmovqb %zmm29, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm26 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm27, %zmm7 -; AVX512F-SLOW-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm19 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm14 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm20 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm29, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm2 -; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 +; AVX512F-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm18 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm31 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm26 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm27 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm8 -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm28, %zmm13 +; AVX512F-SLOW-NEXT: vpmovqb %zmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm25 ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm28, %zmm8 ; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -11529,12 +11528,13 @@ ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rax) +; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -12107,663 +12107,646 @@ ; ; AVX512BW-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 +; AVX512BW-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm20 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm6, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm5 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm5, %xmm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm13 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512BW-SLOW-NEXT: vpmovqb %zmm1, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm20 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 464(%rdi), %xmm25 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm23 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm25, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm6[7] +; AVX512BW-SLOW-NEXT: vmovdqa 432(%rdi), %xmm5 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm18 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm5, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm1, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm10, %xmm10 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 368(%rdi), %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm26, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm1, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512BW-SLOW-NEXT: vmovdqa 336(%rdi), %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm1, %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 320(%rdi), %xmm19 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm19, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] +; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm14 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm24 ; AVX512BW-SLOW-NEXT: movb $-64, %al ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm1, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm1, %xmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm18[0],xmm13[0],xmm18[1],xmm13[1],xmm18[2],xmm13[2],xmm18[3],xmm13[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm13 -; AVX512BW-SLOW-NEXT: vpmovqb %ymm13, %xmm13 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm18[0],xmm15[1],xmm18[1],xmm15[2],xmm18[2],xmm15[3],xmm18[3] -; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm2, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm24 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 240(%rdi), %xmm29 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm29, %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm11, %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm8, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm21, %xmm17 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm16 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm22 +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm22, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm13, %xmm18 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm18[0],xmm1[0],xmm18[1],xmm1[1],xmm18[2],xmm1[2],xmm18[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm18 +; AVX512BW-SLOW-NEXT: vpmovqb %ymm18, %xmm18 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 112(%rdi), %xmm28 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm28, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm4, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm4, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm4, %xmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm23[0],xmm2[0],xmm23[1],xmm2[1],xmm23[2],xmm2[2],xmm23[3],xmm2[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 400(%rdi), %xmm24 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm27, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm25, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm31, %xmm27 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm25 = xmm27[0],xmm25[0],xmm27[1],xmm25[1],xmm27[2],xmm25[2],xmm27[3],xmm25[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm22 -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm13, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm30, %xmm23 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm30, %xmm27 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm25 = xmm27[0],xmm25[0],xmm27[1],xmm25[1],xmm27[2],xmm25[2],xmm27[3],xmm25[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm12, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm4, %xmm25 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm24 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm4 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm4, %xmm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm14, %xmm16 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm20 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm31 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 176(%rdi), %xmm25 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm25, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm0, %xmm24 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm8, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm27 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm0, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm27 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm26, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm25[0],xmm5[0],xmm25[1],xmm5[1],xmm25[2],xmm5[2],xmm25[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm5 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm29, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm8, %xmm29 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm25[0],xmm5[0],xmm25[1],xmm5[1],xmm25[2],xmm5[2],xmm25[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm25 +; AVX512BW-SLOW-NEXT: vmovdqa64 144(%rdi), %xmm27 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm17, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm2 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm23 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm18, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm26[0],xmm0[0],xmm26[1],xmm0[1],xmm26[2],xmm0[2],xmm26[3],xmm0[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm31 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm1 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm31[0],xmm1[1],xmm31[1],xmm1[2],xmm31[2],xmm1[3],xmm31[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm29 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm12, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm29, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm15, %zmm3 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm8, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm18, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm10, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa %xmm9, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm17, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm31[0],xmm2[1],xmm31[1],xmm2[2],xmm31[2],xmm2[3],xmm31[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm29, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm9 +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm20, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm25, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm20 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm29, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm7, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm28, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm18, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm31, %xmm30 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm30[0],xmm3[1],xmm30[1],xmm3[2],xmm30[2],xmm3[3],xmm30[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm17, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm24, %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm23, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm9, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm12 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm31 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm31[0],xmm5[0],xmm31[1],xmm5[1],xmm31[2],xmm5[2],xmm31[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm27, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm20, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm20, %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm18, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm20 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $32, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm7, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm24, %xmm31 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm30 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm13, %zmm6 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm12, %zmm6 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm24, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm26[0],xmm6[0],xmm26[1],xmm6[1],xmm26[2],xmm6[2],xmm26[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm6 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm23, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm18, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $40, %zmm7, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsp), %xmm25 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm25, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm17, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm31, %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm26 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm18, %xmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm31 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm0, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm30, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm13, %zmm5 -; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm24, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm30 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm12, %zmm6 +; AVX512BW-SLOW-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm17, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm5 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm26 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm29, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm30 +; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm21, %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm30[0],xmm6[0],xmm30[1],xmm6[1],xmm30[2],xmm6[2],xmm30[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm25, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm27, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm23, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm20, %xmm0 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm19, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm18, %xmm0 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm21, %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm3 ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm14, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-SLOW-NEXT: vpsrlq $48, %zmm7, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm25, %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm17, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm31, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm9 -; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm24, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm21 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm21[0],xmm8[0],xmm21[1],xmm8[1],xmm21[2],xmm8[2],xmm21[3],xmm8[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm30, %xmm5 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm7 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm8 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm13, %zmm7 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm12, %zmm7 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm7, %xmm7 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm17, %xmm2 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm28, %xmm7 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm22, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm24, %xmm8 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm29, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm8 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm25, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm4 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm29, %xmm7 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm27, %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm23, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm16, %xmm4 -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm20, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm19, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm18, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm21, %xmm4 +; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm20, %xmm4 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm14, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $56, %zmm30, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -12785,7 +12768,7 @@ ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX512BW-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -83,25 +83,57 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rdx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: store_i16_stride2_vf8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride2_vf8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride2_vf8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride2_vf8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride2_vf8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: store_i16_stride2_vf8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride2_vf8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf8: ; AVX512BW: # %bb.0: @@ -128,15 +160,15 @@ ; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm4, 16(%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf16: @@ -213,25 +245,25 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm6 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm6, 112(%rdx) -; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm5, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm4, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, 16(%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm6, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm5, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf32: @@ -244,22 +276,22 @@ ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride2_vf32: @@ -270,16 +302,16 @@ ; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -350,102 +382,102 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm14 ; SSE-NEXT: movdqa 48(%rsi), %xmm15 ; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] ; SSE-NEXT: movdqa %xmm3, %xmm15 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] ; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] ; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE-NEXT: movdqa 112(%rsi), %xmm11 ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE-NEXT: movdqa %xmm0, 224(%rdx) -; SSE-NEXT: movdqa %xmm7, 240(%rdx) -; SSE-NEXT: movdqa %xmm6, 192(%rdx) -; SSE-NEXT: movdqa %xmm12, 208(%rdx) -; SSE-NEXT: movdqa %xmm4, 160(%rdx) -; SSE-NEXT: movdqa %xmm13, 176(%rdx) -; SSE-NEXT: movdqa %xmm3, 128(%rdx) -; SSE-NEXT: movdqa %xmm15, 144(%rdx) -; SSE-NEXT: movdqa %xmm5, 96(%rdx) -; SSE-NEXT: movdqa %xmm14, 112(%rdx) -; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm10, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm9, 48(%rdx) -; SSE-NEXT: movdqa %xmm8, (%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; SSE-NEXT: movdqa %xmm0, 240(%rdx) +; SSE-NEXT: movdqa %xmm7, 224(%rdx) +; SSE-NEXT: movdqa %xmm6, 208(%rdx) +; SSE-NEXT: movdqa %xmm12, 192(%rdx) +; SSE-NEXT: movdqa %xmm4, 176(%rdx) +; SSE-NEXT: movdqa %xmm13, 160(%rdx) +; SSE-NEXT: movdqa %xmm3, 144(%rdx) +; SSE-NEXT: movdqa %xmm15, 128(%rdx) +; SSE-NEXT: movdqa %xmm5, 112(%rdx) +; SSE-NEXT: movdqa %xmm14, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm10, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm9, 32(%rdx) +; SSE-NEXT: movdqa %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 224(%rdx) +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 224(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 240(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 48(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 192(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 208(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 160(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 176(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 128(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 160(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 176(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 128(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 144(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 112(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rdx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride2_vf64: @@ -460,82 +492,82 @@ ; AVX2-ONLY-NEXT: vmovdqa 96(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15] ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 192(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i16_stride2_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm5 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm6 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm7 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512F-NEXT: vmovdqa %xmm9, 48(%rdx) -; AVX512F-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %xmm7, 16(%rdx) -; AVX512F-NEXT: vmovdqa %xmm14, (%rdx) -; AVX512F-NEXT: vmovdqa %xmm11, 112(%rdx) -; AVX512F-NEXT: vmovdqa %xmm13, 96(%rdx) -; AVX512F-NEXT: vmovdqa %xmm10, 80(%rdx) -; AVX512F-NEXT: vmovdqa %xmm15, 64(%rdx) -; AVX512F-NEXT: vmovdqa %xmm6, 240(%rdx) -; AVX512F-NEXT: vmovdqa %xmm8, 224(%rdx) -; AVX512F-NEXT: vmovdqa %xmm5, 208(%rdx) -; AVX512F-NEXT: vmovdqa %xmm4, 192(%rdx) -; AVX512F-NEXT: vmovdqa %xmm3, 176(%rdx) -; AVX512F-NEXT: vmovdqa %xmm2, 160(%rdx) -; AVX512F-NEXT: vmovdqa %xmm1, 144(%rdx) -; AVX512F-NEXT: vmovdqa64 %xmm16, 128(%rdx) +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm9 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm11 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512F-NEXT: vmovdqa %xmm13, 224(%rdx) +; AVX512F-NEXT: vmovdqa %xmm0, 240(%rdx) +; AVX512F-NEXT: vmovdqa %xmm11, 192(%rdx) +; AVX512F-NEXT: vmovdqa %xmm14, 208(%rdx) +; AVX512F-NEXT: vmovdqa %xmm9, 160(%rdx) +; AVX512F-NEXT: vmovdqa %xmm12, 176(%rdx) +; AVX512F-NEXT: vmovdqa %xmm8, 128(%rdx) +; AVX512F-NEXT: vmovdqa %xmm10, 144(%rdx) +; AVX512F-NEXT: vmovdqa %xmm5, 96(%rdx) +; AVX512F-NEXT: vmovdqa %xmm7, 112(%rdx) +; AVX512F-NEXT: vmovdqa %xmm4, 64(%rdx) +; AVX512F-NEXT: vmovdqa %xmm6, 80(%rdx) +; AVX512F-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX512F-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX512F-NEXT: vmovdqa %xmm1, (%rdx) +; AVX512F-NEXT: vmovdqa64 %xmm16, 16(%rdx) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride2_vf64: @@ -544,17 +576,17 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -567,9 +599,6 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX1: {{.*}} ; AVX2: {{.*}} -; AVX2-FAST: {{.*}} -; AVX2-FAST-PERLANE: {{.*}} -; AVX2-SLOW: {{.*}} ; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} @@ -579,10 +608,8 @@ ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} -; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -24,8 +24,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: movq %xmm0, (%rcx) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movd %xmm0, 8(%rcx) @@ -102,65 +102,117 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,4,5,8,9,14,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx) ; AVX1-ONLY-NEXT: vmovq %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; -; AVX2-ONLY-LABEL: store_i16_stride3_vf4: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rcx) -; AVX2-ONLY-NEXT: vzeroupper -; AVX2-ONLY-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride3_vf4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride3_vf4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovq %xmm1, 16(%rcx) -; AVX512F-NEXT: vmovdqa %xmm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride3_vf4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,0,0,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 16(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: store_i16_stride3_vf4: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13],zero,zero,ymm0[22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 16(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride3_vf4: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13],zero,zero,ymm0[22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-FAST-NEXT: vmovq %xmm1, 16(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride3_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u> -; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) -; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx) +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,1,9,17,2,10,18,3,11,19,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, 16(%rcx) +; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -311,20 +363,21 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-SLOW-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper @@ -378,124 +431,124 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa (%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm9, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,2,2] ; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm11, %xmm8 ; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; SSE-NEXT: pandn %xmm10, %xmm3 -; SSE-NEXT: por %xmm12, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,2,2] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm12, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,2,2] +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm7, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: por %xmm11, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, 32(%rcx) -; SSE-NEXT: movdqa %xmm6, 80(%rcx) +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, 80(%rcx) +; SSE-NEXT: movdqa %xmm4, 32(%rcx) +; SSE-NEXT: movdqa %xmm6, 64(%rcx) +; SSE-NEXT: movdqa %xmm10, 48(%rcx) +; SSE-NEXT: movdqa %xmm8, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm3, 16(%rcx) -; SSE-NEXT: movdqa %xmm8, 48(%rcx) -; SSE-NEXT: movdqa %xmm5, 64(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf16: @@ -549,7 +602,7 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -593,7 +646,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -698,343 +751,339 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 -; SSE-NEXT: movdqa 48(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm10 ; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa 48(%rsi), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: movdqa 48(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa 32(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm4, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm11 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,2,2] -; SSE-NEXT: pandn %xmm12, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,5] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,0,0] -; SSE-NEXT: pandn %xmm12, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa 48(%rdx), %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] +; SSE-NEXT: pand %xmm13, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm13, %xmm10 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[2,2,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,1,0,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm13, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,1,0,4,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm12, 32(%rcx) -; SSE-NEXT: movdqa %xmm1, 80(%rcx) -; SSE-NEXT: movdqa %xmm0, 128(%rcx) +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm11, 176(%rcx) -; SSE-NEXT: movdqa %xmm5, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) -; SSE-NEXT: movdqa %xmm15, 48(%rcx) -; SSE-NEXT: movdqa %xmm14, 64(%rcx) -; SSE-NEXT: movdqa %xmm13, 96(%rcx) -; SSE-NEXT: movdqa %xmm10, 112(%rcx) -; SSE-NEXT: movdqa %xmm9, 144(%rcx) +; SSE-NEXT: movdqa %xmm3, 128(%rcx) +; SSE-NEXT: movdqa %xmm0, 80(%rcx) +; SSE-NEXT: movdqa %xmm10, 32(%rcx) +; SSE-NEXT: movdqa %xmm4, 160(%rcx) +; SSE-NEXT: movdqa %xmm2, 144(%rcx) +; SSE-NEXT: movdqa %xmm15, 112(%rcx) +; SSE-NEXT: movdqa %xmm14, 96(%rcx) +; SSE-NEXT: movdqa %xmm12, 64(%rcx) +; SSE-NEXT: movdqa %xmm8, 48(%rcx) +; SSE-NEXT: movdqa %xmm9, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm4[1,2],xmm7[3],xmm4[4,5],xmm7[6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1,2],xmm7[3],xmm0[4,5],xmm7[6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4],xmm11[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm7[1,2],xmm11[3],xmm7[4,5],xmm11[6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm11[2],xmm0[3,4],xmm11[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5],xmm14[6],xmm11[7] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm4[2],xmm14[3,4],xmm4[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2,3],xmm11[4],xmm4[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx) +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 176(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 128(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 128(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7> -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm13[2],xmm10[3,4],xmm13[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u> ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 96(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1042,75 +1091,75 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1119,156 +1168,153 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1],xmm5[2],xmm9[3,4],xmm5[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,u,3,3,u,4,4,u> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-LABEL: store_i16_stride3_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-NEXT: vprold $16, %xmm2, %xmm4 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512F-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-NEXT: vprold $16, %xmm3, %xmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vprold $16, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> -; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm4 -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3,4],xmm9[5],xmm6[6,7] +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm3, %ymm10, %ymm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpandn %ymm10, %ymm12, %ymm10 +; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm10 +; AVX512F-NEXT: vprold $16, %xmm6, %xmm7 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpandn %ymm9, %ymm10, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-NEXT: vprold $16, %xmm0, %xmm5 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vpermd (%rdx), %zmm5, %zmm5 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm6 +; AVX512F-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <5,5,u,6,6,u,7,7> +; AVX512F-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm10, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1668,197 +1714,201 @@ ; AVX1-ONLY-LABEL: store_i16_stride3_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3,4],xmm15[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm15[2],xmm14[3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3,4],xmm15[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3],xmm15[4],xmm14[5,6],xmm15[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm7[1],xmm10[2,3],xmm7[4],xmm10[5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3,4],xmm15[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3],xmm14[4],xmm12[5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3,4],xmm11[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 48(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 288(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 368(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 320(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 336(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 368(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 320(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 336(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 288(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 272(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 224(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 240(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 192(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1866,40 +1916,45 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rcx) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rcx) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride3_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm5 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 @@ -1908,81 +1963,78 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7> -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm11, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[3,3,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2],xmm4[3,4],xmm10[5],xmm4[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[3,3,3,3,4,5,6,7] @@ -1994,7 +2046,7 @@ ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm11 @@ -2002,184 +2054,185 @@ ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm13 -; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm15 +; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vpermd 64(%rdi), %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm5 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u> -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm13, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 320(%rcx) +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 320(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 96(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 192(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 352(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm4 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm14 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm14 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm12 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm9, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm15 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3,4],xmm15[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 80(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm9 ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm5, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm5, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpermd 64(%rdi), %ymm5, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm11 ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 224(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 352(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 96(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 256(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 192(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper @@ -2188,141 +2241,142 @@ ; AVX2-FAST-PERLANE-LABEL: store_i16_stride3_vf64: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <5,5,u,6,6,u,7,7> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm11, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2],xmm15[3,4],xmm7[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3,4],xmm14[5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm12, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm15, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm11, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm6, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm15[2],xmm3[3,4],xmm15[5],xmm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2],xmm0[3,4],xmm4[5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm5, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermd 64(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm13, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm5, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 352(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 352(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -2330,155 +2384,148 @@ ; ; AVX512F-LABEL: store_i16_stride3_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = -; AVX512F-NEXT: vpermd %ymm3, %ymm19, %ymm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpandnq %ymm3, %ymm16, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm10, %zmm3 -; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512F-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512F-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX512F-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm11, %xmm15, %xmm15 -; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> -; AVX512F-NEXT: vpermd 64(%rdx), %zmm20, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpternlogq $184, %zmm15, %zmm21, %zmm10 -; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpshufb %ymm7, %ymm15, %ymm15 -; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12 -; AVX512F-NEXT: vpshufb %ymm2, %ymm12, %ymm12 -; AVX512F-NEXT: vpor %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm13[2],xmm15[3,4],xmm13[5],xmm15[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm12[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm23 = <5,5,u,6,6,u,7,7> -; AVX512F-NEXT: vpermd %ymm12, %ymm23, %ymm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpandnq %ymm15, %ymm22, %ymm15 -; AVX512F-NEXT: vpshufb %ymm9, %ymm12, %ymm12 -; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm5, %zmm18 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-NEXT: vpshufb %ymm2, %ymm15, %ymm15 -; AVX512F-NEXT: vpor %ymm5, %ymm15, %ymm5 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm15 -; AVX512F-NEXT: vprold $16, %xmm15, %xmm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512F-NEXT: vmovdqa64 %xmm26, %xmm15 -; AVX512F-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm14 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-NEXT: vprold $16, %xmm3, %xmm4 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2],xmm8[3,4],xmm4[5],xmm8[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm12 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-NEXT: vmovdqa 96(%rdx), %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = +; AVX512F-NEXT: vpermd %ymm2, %ymm18, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpandn %ymm2, %ymm15, %ymm2 +; AVX512F-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-NEXT: vpternlogq $248, %zmm13, %zmm12, %zmm2 +; AVX512F-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; AVX512F-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512F-NEXT: vpermd %ymm14, %ymm19, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512F-NEXT: vpternlogq $248, %zmm17, %zmm0, %zmm5 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2],xmm10[3,4],xmm5[5],xmm10[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vpermd (%rdx), %zmm16, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm5 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512F-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm12 +; AVX512F-NEXT: vmovdqa %ymm1, %ymm14 +; AVX512F-NEXT: vpshufb %ymm1, %ymm12, %ymm12 +; AVX512F-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX512F-NEXT: vprold $16, %xmm12, %xmm10 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512F-NEXT: vpermd %ymm8, %ymm18, %ymm8 +; AVX512F-NEXT: vpandn %ymm8, %ymm15, %ymm8 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm8 +; AVX512F-NEXT: vpternlogq $248, %zmm13, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512F-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm1[2],xmm11[3,4],xmm1[5],xmm11[6,7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 80(%rsi), %xmm9 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX512F-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2],xmm1[3,4],xmm9[5],xmm1[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpermd 64(%rdx), %zmm16, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-NEXT: vpshufb %ymm2, %ymm7, %ymm7 -; AVX512F-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm13 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX512F-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512F-NEXT: vpshufb %ymm14, %ymm9, %ymm9 +; AVX512F-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm9 +; AVX512F-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <5,5,u,6,6,u,7,7> +; AVX512F-NEXT: vpermd %ymm6, %ymm9, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512F-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-NEXT: vpternlogq $248, %zmm10, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512F-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX512F-NEXT: vpor %ymm0, %ymm12, %ymm0 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm12 +; AVX512F-NEXT: vmovdqa 112(%rsi), %xmm13 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512F-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm13[2],xmm7[3,4],xmm13[5],xmm7[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm7, %ymm7 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vpermd %ymm8, %ymm23, %ymm7 -; AVX512F-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512F-NEXT: vpshufb %ymm9, %ymm8, %ymm8 -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-NEXT: vpternlogq $248, %zmm12, %zmm0, %zmm7 -; AVX512F-NEXT: vprold $16, %xmm6, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2],xmm8[3,4],xmm0[5],xmm8[6,7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-NEXT: vprold $16, %xmm24, %xmm4 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm25[1,1,2,2] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpermd (%rdx), %zmm20, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm18, 320(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512F-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] +; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm12, %ymm7 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-NEXT: vpermd %ymm3, %ymm9, %ymm7 +; AVX512F-NEXT: vpandn %ymm7, %ymm11, %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq $248, %zmm10, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm1, 256(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2495,27 +2542,26 @@ ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm6 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2532,6 +2578,7 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX2: {{.*}} +; AVX2-ONLY: {{.*}} ; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -20,47 +20,104 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, (%r8) ; SSE-NEXT: retq ; -; AVX1-LABEL: store_i16_stride4_vf2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX1-NEXT: vmovdqa %xmm0, (%r8) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: store_i16_stride4_vf2: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX1-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride4_vf2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15] -; AVX512F-NEXT: vmovdqa %xmm0, (%r8) -; AVX512F-NEXT: retq +; AVX2-SLOW-LABEL: store_i16_stride4_vf2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i16_stride4_vf2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11] -; AVX512BW-NEXT: vpermi2w %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vmovdqa %xmm2, (%r8) -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: store_i16_stride4_vf2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FAST-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: store_i16_stride4_vf2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512F-SLOW-LABEL: store_i16_stride4_vf2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride4_vf2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastd (%rdx), %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq (%rcx){1to4}, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: store_i16_stride4_vf2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, (%r8) +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: store_i16_stride4_vf2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX512BW-FAST-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,14,5,15,4,14,5,15] +; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm2, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512BW-FAST-NEXT: vmovdqa %xmm0, (%r8) +; AVX512BW-FAST-NEXT: retq %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64 @@ -236,10 +293,10 @@ ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm1, 48(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm1, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 16(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: retq @@ -351,33 +408,33 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: movdqa %xmm1, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 96(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) +; SSE-NEXT: movdqa %xmm10, 64(%r8) +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm2, 16(%r8) +; SSE-NEXT: movdqa %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride4_vf16: @@ -408,26 +465,26 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -461,26 +518,26 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 96(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 64(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-ONLY-NEXT: vzeroupper @@ -492,27 +549,27 @@ ; AVX512F-NEXT: vmovdqa 16(%rcx), %xmm1 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa (%rsi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm5 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] -; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,1,17,2,18,3,19,8,24,9,25,10,26,11,27] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -563,104 +620,104 @@ ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] ; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; SSE-NEXT: movdqa 48(%rdx), %xmm15 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] -; SSE-NEXT: movdqa %xmm2, 224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) -; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) -; SSE-NEXT: movdqa %xmm11, 96(%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r8) -; SSE-NEXT: movdqa %xmm7, 64(%r8) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm3, 208(%r8) +; SSE-NEXT: movdqa %xmm0, 192(%r8) +; SSE-NEXT: movdqa %xmm4, 176(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm13, 144(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm11, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 96(%r8) +; SSE-NEXT: movdqa %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movdqa %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride4_vf32: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 @@ -693,9 +750,9 @@ ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -723,45 +780,45 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 ; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm6[0],zero,xmm6[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm9, %ymm6 @@ -794,9 +851,9 @@ ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -824,27 +881,27 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm9, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm4, 160(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 192(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 96(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride4_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -854,14 +911,14 @@ ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -931,23 +988,23 @@ ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride4_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdx), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -957,14 +1014,14 @@ ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] ; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -1034,23 +1091,23 @@ ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride4_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1060,14 +1117,14 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm4 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -1133,23 +1190,23 @@ ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride4_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -1159,14 +1216,14 @@ ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdx), %xmm4 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] @@ -1232,10 +1289,10 @@ ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -1493,134 +1550,134 @@ ; AVX1-ONLY-LABEL: store_i16_stride4_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $40, %rsp -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3],ymm12[4],ymm0[5],ymm12[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm2[0],zero,xmm2[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm11[0],zero,xmm11[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4],ymm4[5],ymm13[6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1630,14 +1687,14 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1651,21 +1708,21 @@ ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -1674,27 +1731,27 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 448(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: addq $40, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1702,134 +1759,134 @@ ; AVX2-ONLY-LABEL: store_i16_stride4_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $40, %rsp -; AVX2-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 16(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm12 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2],ymm0[3],ymm12[4],ymm0[5],ymm12[6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm2[0],zero,xmm2[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm11[0],zero,xmm11[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm14, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm2[1],ymm14[2],ymm2[3],ymm14[4],ymm2[5],ymm14[6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 80(%rcx), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm12, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm12[0],zero,xmm12[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 48(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2],ymm4[3],ymm15[4],ymm4[5],ymm15[6],ymm4[7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4],ymm4[5],ymm13[6],ymm4[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm7[0],zero,xmm7[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[0,0,1,1] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4],ymm6[5],ymm10[6],ymm6[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX2-ONLY-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,0,1,1] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm7[1],ymm13[2],ymm7[3],ymm13[4],ymm7[5],ymm13[6],ymm7[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4],ymm8[5],ymm13[6],ymm8[7] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4],ymm9[5],ymm10[6],ymm9[7] -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX2-ONLY-NEXT: vmovdqa 80(%rcx), %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm11[0],zero,xmm11[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm8[0],zero,xmm8[1],zero -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] -; AVX2-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[0,0,1,1] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm13, %ymm8 -; AVX2-ONLY-NEXT: vmovdqa 96(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 80(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm15[0],zero,xmm15[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] ; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1839,14 +1896,14 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa 112(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdx), %xmm13 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] @@ -1860,21 +1917,21 @@ ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX2-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovdqa 112(%rsi), %xmm14 +; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4],ymm13[5],ymm0[6],ymm13[7] -; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 @@ -1883,810 +1940,808 @@ ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, 480(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm12, 448(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm11, 416(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, 384(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, 224(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm10, 192(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 128(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm5, 352(%r8) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 480(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 448(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, 416(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm12, 384(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm11, 352(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, 320(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, 288(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, 256(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 224(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm5, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX2-ONLY-NEXT: addq $40, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride4_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm14, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm14, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm0[0],zero,xmm0[1],zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm15, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm14, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm16, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm9[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm16, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm13, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rcx), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rsi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 448(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 384(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride4_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm14, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm14, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm0[0],zero,xmm0[1],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm14, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 48(%rdi), %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rsi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm16, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm14, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm9[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm9, %ymm16, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm16, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm13, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rcx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rdx), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rsi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 112(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 448(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 384(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride4_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 ; AVX512DQ-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm6, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm13, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdi), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdx), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm6[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm8, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm5, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm6, %ymm16, %ymm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm13[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm13[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm13, %ymm20, %ymm13 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm11, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm6, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm5, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rcx), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 80(%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm7, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rcx), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 112(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 384(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride4_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rcx), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdx), %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm15[0,0,1,1] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 ; AVX512DQ-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm6, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm16, %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm13[0],zero,xmm13[1],zero ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm13, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 112(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa 112(%rcx), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 112(%rdx), %xmm15 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm6[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm8, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm5, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm6, %ymm16, %ymm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm15[0],zero,xmm15[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm16, %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm16 = xmm13[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm16 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm13, %ymm16, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rcx), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm20 = xmm13[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm13, %ymm20, %ymm13 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm13, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm11, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm6, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm5, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rcx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 80(%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm6, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 128(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 320(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 256(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%r8) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%r8) +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm6, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm6[0],zero,xmm6[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm7, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 112(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 112(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm9[0],zero,xmm9[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 112(%rcx), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 112(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 448(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 384(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 256(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%r8) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -2700,32 +2755,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 @@ -2740,14 +2795,14 @@ ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -2763,12 +2818,11 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} +; AVX1: {{.*}} ; AVX2: {{.*}} ; AVX512: {{.*}} -; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512BW-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -18,54 +18,57 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,7,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] ; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movd %xmm0, 16(%r9) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movd %xmm1, 16(%r9) ; SSE-NEXT: movdqa %xmm3, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf2: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vmovd %xmm1, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovd %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf2: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -81,16 +84,20 @@ ; ; AVX2-FAST-LABEL: store_i16_stride5_vf2: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vmovd %xmm1, 16(%r9) ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9) @@ -99,11 +106,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf2: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -115,31 +124,55 @@ ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-LABEL: store_i16_stride5_vf2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, 16(%r9) -; AVX512F-NEXT: vmovdqa %xmm0, (%r9) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: store_i16_stride5_vf2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovd %xmm1, 16(%r9) +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, (%r9) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: store_i16_stride5_vf2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-FAST-NEXT: vmovd %xmm1, 16(%r9) +; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%r9) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride5_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -215,57 +248,60 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,11,10,11,u,u,u,u,4,5,12,13,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,1,2,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovq %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovq %xmm1, 32(%r9) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -273,25 +309,26 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -299,78 +336,79 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6],ymm0[7],ymm1[8,9],ymm0[10,11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm2, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: movq (%r8), %rax ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7],ymm3[8,9],ymm0[10,11],ymm3[12,13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf4: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: movq (%r8), %rax ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq %rax, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6],ymm0[7],ymm3[8,9],ymm0[10,11],ymm3[12,13,14],ymm0[15] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -382,14 +420,13 @@ ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512BW-NEXT: vmovq %xmm1, 32(%r9) -; AVX512BW-NEXT: vmovdqa %ymm0, (%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,4,8,12,32,1,5,9,13,33,2,6,10,14,34,3,7,11,15,35,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, 32(%r9) +; AVX512BW-NEXT: vmovdqa %ymm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -410,146 +447,146 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm6 ; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 ; SSE-NEXT: pand %xmm8, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 ; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm5 -; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,1] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,3,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] ; SSE-NEXT: pand %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 ; SSE-NEXT: por %xmm11, %xmm10 ; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] -; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,1,0,1] +; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; SSE-NEXT: psrlq $48, %xmm7 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm6[1] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 ; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,7,6] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] -; SSE-NEXT: pand %xmm7, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 -; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, 16(%r9) -; SSE-NEXT: movdqa %xmm4, 48(%r9) -; SSE-NEXT: movdqa %xmm1, 64(%r9) +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, 48(%r9) +; SSE-NEXT: movdqa %xmm4, 16(%r9) +; SSE-NEXT: movdqa %xmm3, 64(%r9) ; SSE-NEXT: movdqa %xmm0, (%r9) -; SSE-NEXT: movdqa %xmm5, 32(%r9) +; SSE-NEXT: movdqa %xmm7, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3],xmm9[4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5],xmm8[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3],xmm10[4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1,2,3,4],xmm9[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3,4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4],xmm7[5],xmm6[6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%r9) -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm9[2],xmm1[3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf8: @@ -607,24 +644,24 @@ ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,5,2,6,2,6,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,12,13],zero,zero,zero,zero,ymm7[2,3,18,19,18,19],zero,zero,zero,zero,ymm7[28,29,20,21,28,29],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm5[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm5[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm5[22,23] +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,6,2,6,u,u> +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,2,3],zero,zero,zero,zero,ymm6[8,9,12,13,16,17],zero,zero,zero,zero,ymm6[18,19,22,23,28,29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,zero,zero,ymm5[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm5[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm5[24,25,28,29] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -633,8 +670,8 @@ ; AVX2-FAST-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm4, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, (%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -799,325 +836,337 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm15 -; SSE-NEXT: movdqa 16(%rsi), %xmm13 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: subq $24, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa 16(%rsi), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm10, %xmm13 +; SSE-NEXT: pandn %xmm7, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm4, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm12 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: por %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm15[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: pand %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm9, %xmm14 -; SSE-NEXT: pandn %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm13 +; SSE-NEXT: por %xmm5, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE-NEXT: psrlq $48, %xmm13 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm13[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm15 ; SSE-NEXT: por %xmm13, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: por %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm12, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm14[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: por %xmm15, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; SSE-NEXT: movdqa %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,5,7,6] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm11, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm9 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm12[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,1] +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; SSE-NEXT: pandn %xmm3, %xmm15 ; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,3,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%r9) -; SSE-NEXT: movdqa %xmm12, 16(%r9) -; SSE-NEXT: movdqa %xmm15, 48(%r9) -; SSE-NEXT: movdqa %xmm9, 64(%r9) -; SSE-NEXT: movdqa %xmm7, 80(%r9) -; SSE-NEXT: movdqa %xmm13, 96(%r9) -; SSE-NEXT: movdqa %xmm14, 128(%r9) +; SSE-NEXT: psrlq $48, %xmm7 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5,7,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, 144(%r9) +; SSE-NEXT: movdqa %xmm15, 128(%r9) +; SSE-NEXT: movdqa %xmm14, 96(%r9) +; SSE-NEXT: movdqa %xmm8, 80(%r9) +; SSE-NEXT: movdqa %xmm4, 64(%r9) +; SSE-NEXT: movdqa %xmm9, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%r9) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: addq $24, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2],xmm0[3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3,4],xmm8[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm2[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm14 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm6, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm13[2],xmm10[3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm14[1],xmm0[2,3,4,5],xmm14[6],xmm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3,4,5],xmm12[6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm15 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm15 = xmm7[1],xmm15[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4,5,6],xmm14[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1,2,3,4],xmm14[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm13[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm11[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm12 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3,4,5],xmm13[6],xmm11[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm12[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4],xmm6[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1],xmm6[2],xmm12[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm7 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm15[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 144(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1144,19 +1193,20 @@ ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2],ymm6[3],ymm10[4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10],ymm6[11],ymm10[12,13],ymm6[14],ymm10[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2],ymm10[3,4],ymm11[5,6,7,8],ymm10[9],ymm11[10],ymm10[11,12],ymm11[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] @@ -1171,20 +1221,19 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1198,9 +1247,9 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm6, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1240,31 +1289,31 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1278,8 +1327,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -1320,31 +1369,31 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3,4],ymm9[5,6,7,8],ymm8[9],ymm9[10],ymm8[11,12],ymm9[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4] @@ -1358,8 +1407,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1367,156 +1416,146 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf16: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5],xmm6[6],xmm8[7] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm1, %ymm10 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm2[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpandn %ymm6, %ymm8, %ymm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-SLOW-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5],xmm5[6],xmm7[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3],xmm6[4],xmm8[5],xmm6[6],xmm8[7] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,12,13,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,22,23,u,u,20,21,u,u,24,25] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm2[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm6 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpandn %ymm6, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm2, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,u,u,26,27,u,u,30,31,28,29,u,u,28,29] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vpternlogq $202, 24(%r8){1to4}, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1529,19 +1568,19 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28> -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2w %ymm2, %ymm3, %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512BW-NEXT: vmovdqa %ymm4, 128(%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 @@ -1980,1118 +2019,1105 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm5[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm15[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm15[1],xmm7[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3,4,5],xmm13[6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm10[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4],xmm2[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm13[2],xmm2[3,4,5,6],xmm13[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm1[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3,4],xmm1[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1],xmm4[2],xmm1[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm0[1,2,3,4],xmm4[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm14 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2,3,4],xmm7[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm15[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm14, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm15[4],xmm5[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm5[2],xmm12[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1,2,3,4],xmm5[5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2],xmm3[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm8[1],xmm1[2,3,4,5],xmm8[6],xmm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm6[2],xmm9[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3,4],xmm6[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 304(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 256(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 272(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 224(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 240(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 192(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 208(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 160(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 176(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride5_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $72, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-SLOW-NEXT: subq $104, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm11 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0],xmm6[1],xmm10[2],xmm6[3],xmm10[4,5],xmm6[6],xmm10[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm12, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5],xmm11[6],xmm9[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6],xmm10[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm7, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5],xmm8[6],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2],ymm9[3,4],ymm14[5,6,7,8],ymm9[9],ymm14[10],ymm9[11,12],ymm14[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,2,3,3,7,6,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2],ymm4[3,4],ymm15[5,6,7,8],ymm4[9],ymm15[10],ymm4[11,12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9 -; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,2,5,5,5,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm11, %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[1,1,1,2,5,5,5,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm11, %ymm1, %ymm11 +; AVX2-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm13 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8],ymm6[9],ymm1[10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[3,2,3,3,7,6,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3,4],ymm6[5,6,7,8],ymm7[9],ymm6[10],ymm7[11,12],ymm6[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10],ymm6[11],ymm11[12,13],ymm6[14],ymm11[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10],ymm6[11],ymm4[12,13],ymm6[14],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm10[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,0,3,0,7,4,7,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,0,3,0,7,4,7,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 256(%r9) +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 224(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 256(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 96(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) -; AVX2-SLOW-NEXT: addq $72, %rsp +; AVX2-SLOW-NEXT: addq $104, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride5_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $72, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 +; AVX2-FAST-NEXT: subq $136, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm9 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0],xmm12[1],xmm9[2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5],xmm8[6],xmm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm12, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,1,1,1] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3],ymm3[4],ymm12[5,6],ymm3[7],ymm12[8,9],ymm3[10],ymm12[11],ymm3[12],ymm12[13,14],ymm3[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm13, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[3,2,3,3,7,6,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[2,3,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,0,3,0,7,4,7,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 288(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $72, %rsp +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-NEXT: addq $136, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride5_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: subq $136, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5],xmm13[6],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2],xmm13[3],xmm7[4,5],xmm13[6],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm6, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0],xmm12[1],xmm9[2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm15, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5],xmm8[6],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,2,3,3,7,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3,4],ymm12[5,6,7,8],ymm5[9],ymm12[10],ymm5[11,12],ymm12[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,1,2,5,5,5,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,1,4,5,6,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3],ymm14[4],ymm5[5,6],ymm14[7],ymm5[8,9],ymm14[10],ymm5[11],ymm14[12],ymm5[13,14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,3,0,7,4,7,4] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm11, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10,11],ymm15[12],ymm14[13],ymm15[14],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,1,2,1,4,5,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3],ymm3[4],ymm12[5,6],ymm3[7],ymm12[8,9],ymm3[10],ymm12[11],ymm3[12],ymm12[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm13, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 48(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[3,2,3,3,7,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm8[2,3,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,0,3,0,7,4,7,4] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm14, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq $165, (%rsp), %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 288(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $136, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,2] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3,4],ymm8[5,6,7,8],ymm4[9],ymm8[10],ymm4[11,12],ymm8[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7,8],ymm15[9],ymm11[10],ymm15[11],ymm11[12,13],ymm15[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm16, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm17 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm17[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm18 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10,11],ymm11[12],ymm14[13],ymm11[14],ymm14[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm14 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm23 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm19, %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2],xmm0[3],xmm12[4,5],xmm0[6],xmm12[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7,8],ymm0[9],ymm15[10],ymm0[11],ymm15[12,13],ymm0[14],ymm15[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm11, %ymm12, %ymm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm14, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3],ymm12[4],ymm7[5],ymm12[6],ymm7[7,8],ymm12[9],ymm7[10,11],ymm12[12],ymm7[13],ymm12[14],ymm7[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm17[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm13 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3,4],ymm6[5,6,7,8],ymm4[9],ymm6[10],ymm4[11,12],ymm6[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3,4],ymm12[5,6,7,8],ymm10[9],ymm12[10],ymm10[11,12],ymm12[13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r9) +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm1[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm17, %zmm4 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm14[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 ; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3,4],ymm2[5,6,7,8],ymm5[9],ymm2[10],ymm5[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8],ymm11[9],ymm1[10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm12[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm7, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3,4],ymm0[5,6,7,8],ymm6[9],ymm0[10],ymm6[11,12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm13, %ymm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm13 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm13[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm19, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm17[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm7 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm4 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,2] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm4 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2],ymm11[3,4],ymm2[5,6,7,8],ymm11[9],ymm2[10],ymm11[11,12],ymm2[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm6[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5],ymm2[6],ymm10[7,8],ymm2[9],ymm10[10,11],ymm2[12],ymm10[13],ymm2[14],ymm10[15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3],ymm6[4],ymm9[5,6],ymm6[7],ymm9[8,9],ymm6[10],ymm9[11],ymm6[12],ymm9[13,14],ymm6[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm8, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm2, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 256(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -4044,511 +4070,506 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm10[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm12, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm13, %ymm7 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm13 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm14[1],xmm2[2,3,4,5],xmm14[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: subq $328, %rsp # imm = 0x148 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1,2,3],xmm12[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm9[4],xmm10[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm12[2],xmm9[3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm7[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm7[2],xmm2[3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm4[2],xmm7[3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm5[1],xmm3[2,3,4,5],xmm5[6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4],xmm4[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm4[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm4[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1,2,3,4],xmm7[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm7[2],xmm0[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm2[1,2,3,4],xmm0[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5],xmm6[6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm5[4],xmm8[5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm4[1,2,3,4],xmm6[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm10[4],xmm13[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm10[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4],xmm5[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm5[2],xmm0[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm2[1],xmm7[2,3,4,5],xmm2[6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm3[1,2,3,4],xmm0[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,1,1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm5[2],xmm4[3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm10[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5,7,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,3,3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3,4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm4[2],xmm13[3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,1,3,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,1] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,3,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,0,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4],xmm6[5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3],xmm7[4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm10[4],xmm6[5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm6[2],xmm12[3,4,5,6],xmm6[7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm7[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4],xmm3[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 80(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm7[1],xmm4[2,3,4,5],xmm7[6],xmm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0],xmm0[1,2,3,4],xmm6[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 624(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 608(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 592(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 576(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 560(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 544(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 528(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 512(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 496(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 480(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 464(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 448(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%r9) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4558,34 +4579,30 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%r9) -; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) +; AVX1-ONLY-NEXT: addq $328, %rsp # imm = 0x148 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4658,11 +4675,11 @@ ; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,2,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7] @@ -4725,8 +4742,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] @@ -4877,11 +4893,11 @@ ; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm14 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm15 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm6[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2],ymm15[3],ymm12[4,5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10],ymm15[11],ymm12[12,13],ymm15[14],ymm12[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm11[1,1,2,2] @@ -4972,52 +4988,47 @@ ; AVX2-FAST-NEXT: subq $936, %rsp # imm = 0x3A8 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5030,7 +5041,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm12, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 64(%r8), %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 @@ -5038,45 +5049,45 @@ ; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 +; AVX2-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5],xmm8[6],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm8 +; AVX2-FAST-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm9 = mem[1,2,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm8 @@ -5090,7 +5101,7 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 @@ -5099,20 +5110,18 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] @@ -5188,12 +5197,11 @@ ; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm15 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5248,13 +5256,13 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm11 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -5347,52 +5355,47 @@ ; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm13, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm14, %ymm15, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rsi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -5405,7 +5408,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm14, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm12, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 64(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm15, %ymm3, %ymm3 @@ -5413,45 +5416,45 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5],xmm11[6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm7, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 72(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5],xmm8[6],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm8, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd $169, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[1,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 104(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5],xmm8[6],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 @@ -5465,7 +5468,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 @@ -5474,20 +5477,18 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm4[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,2,3,4,5,4,5,8,9,8,9,8,9,8,9,22,23,18,19,20,21,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] @@ -5563,12 +5564,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5623,13 +5623,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload @@ -5719,769 +5719,763 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm21[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm1[12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm5, %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm31 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4],ymm5[5,6,7,8],ymm4[9],ymm5[10],ymm4[11,12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5],xmm8[6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm6[1],ymm9[2],ymm6[3,4],ymm9[5,6,7,8],ymm6[9],ymm9[10],ymm6[11,12],ymm9[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13],ymm6[14],ymm9[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0],xmm10[1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5],xmm10[6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm28 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3,4],ymm4[5,6,7,8],ymm0[9],ymm4[10],ymm0[11,12],ymm4[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm21[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3,4],ymm0[5,6,7,8],ymm10[9],ymm0[10],ymm10[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm11[1],xmm4[2],xmm11[3],xmm4[4,5],xmm11[6],xmm4[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2,3],ymm8[4],ymm11[5],ymm8[6],ymm11[7,8],ymm8[9],ymm11[10,11],ymm8[12],ymm11[13],ymm8[14],ymm11[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,1,1,1] -; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,2,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2],ymm8[3,4],ymm7[5,6,7,8],ymm8[9],ymm7[10],ymm8[11,12],ymm7[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm4, %ymm8 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8],ymm7[9],ymm3[10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,2,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdi), %ymm30 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm13, %ymm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm30[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] +; AVX512F-SLOW-NEXT: vpandn %ymm5, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm11, %ymm5 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3,4],ymm1[5,6,7,8],ymm4[9],ymm1[10],ymm4[11,12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm13, %ymm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0,1],ymm1[2],ymm4[3],ymm1[4],ymm4[5,6],ymm1[7],ymm4[8,9],ymm1[10],ymm4[11],ymm1[12],ymm4[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,2,6,7,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm29[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5],ymm13[6],ymm1[7,8],ymm13[9],ymm1[10,11],ymm13[12],ymm1[13],ymm13[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5],ymm4[6],ymm0[7,8],ymm4[9],ymm0[10,11],ymm4[12],ymm0[13],ymm4[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,2,3,3,7,6,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm14, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm1, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm28[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm17[2,3,2,2] -; AVX512F-SLOW-NEXT: vprolq $16, %ymm11, %ymm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[0,1,2,1,4,5,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3],ymm11[4],ymm14[5,6],ymm11[7],ymm14[8,9],ymm11[10],ymm14[11],ymm11[12],ymm14[13,14],ymm11[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm12 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[3,0,3,0,7,4,7,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm5 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm18[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,5,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[1,1,1,2,5,5,5,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5],ymm9[6],ymm5[7,8],ymm9[9],ymm5[10,11],ymm9[12],ymm5[13],ymm9[14],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm9, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm9 -; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm20 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm20, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm11, %zmm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm18, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm14 -; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm17 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm20, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm22[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm12, %ymm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm25[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm24[0,1,0,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm19[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm26[2,2,3,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm29[3,2,3,3,7,6,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm12, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm27[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5],ymm0[6],ymm14[7,8],ymm0[9],ymm14[10,11],ymm0[12],ymm14[13],ymm0[14],ymm14[15] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm12, %ymm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm27[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3],ymm14[4],ymm12[5,6],ymm14[7],ymm12[8,9],ymm14[10],ymm12[11],ymm14[12],ymm12[13,14],ymm14[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8],ymm0[9],ymm14[10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10,11],ymm14[12],ymm8[13],ymm14[14],ymm8[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm29 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm19, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm11, %zmm29, %zmm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm14, %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm12, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm25[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm12[0,1,1,1] +; AVX512F-SLOW-NEXT: vprolq $16, %ymm10, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm25[0,1,2,1,4,5,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[3,0,3,0,7,4,7,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm11 = mem[2,2,3,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[1,1,1,2,5,5,5,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10,11],ymm12[12],ymm2[13],ymm12[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm17[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq 80(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpandn %ymm10, %ymm14, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm10 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,0,1,4,5,4,5] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm17, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm17, %zmm16 +; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm12, %zmm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm17, %zmm19 -; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm25, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm17, %zmm13 -; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm15, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm21, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm2, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm3, %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 576(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%r9) +; AVX512F-SLOW-NEXT: vpshufd $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = mem[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm16, %zmm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,0,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1] +; AVX512F-SLOW-NEXT: vpermq $186, (%rsp), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,2] +; AVX512F-SLOW-NEXT: vpshufd $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm6 = mem[0,2,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm31[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm26[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm20[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm19[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm20 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm11, %zmm17, %zmm20 +; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm14, %zmm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm22, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm12, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 88(%r8), %ymm12 +; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm18 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm28 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm23 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm6, %zmm28, %zmm24 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm6, %zmm23, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm25, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm17, %zmm14 +; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm6 +; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm18 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm27, %zmm16, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm1, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm1, %zmm2, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 384(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 576(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 512(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 448(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 320(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 448(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 512(%r9) -; AVX512F-SLOW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%r9) +; AVX512F-SLOW-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $488, %rsp # imm = 0x1E8 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,10,11,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,10,11,10,11,6,7,8,9,8,9,12,13,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm4 ; AVX512F-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdx), %ymm18 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm23[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm4 +; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm6, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: vpbroadcastq 72(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,2,2,2] +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 32(%rdi), %ymm30 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm30[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpandn %ymm3, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vprolq $16, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3],ymm0[4],ymm8[5,6],ymm0[7],ymm8[8,9],ymm0[10],ymm8[11],ymm0[12],ymm8[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10],ymm7[11],ymm0[12,13],ymm7[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm27 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm27[3,2,3,3,7,6,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2],ymm6[3,4],ymm2[5,6,7,8],ymm6[9],ymm2[10],ymm6[11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm20 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,2,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm22 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,12,13,14,15,14,15,10,11,12,13,14,15,14,15,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm27 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,10,11,14,15,14,15,12,13,14,15,12,13,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm27[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm23[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3],ymm7[4],ymm10[5,6],ymm7[7],ymm10[8,9],ymm7[10],ymm10[11],ymm7[12],ymm10[13,14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm7 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8],ymm10[9],ymm4[10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm23[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5],ymm1[6],ymm11[7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13],ymm1[14],ymm11[15] -; AVX512F-FAST-NEXT: vprolq $16, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm23[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3],ymm11[4],ymm5[5,6],ymm11[7],ymm5[8,9],ymm11[10],ymm5[11],ymm11[12],ymm5[13,14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[1,1,1,2,5,5,5,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10,11],ymm5[12],ymm0[13],ymm5[14],ymm0[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm27 -; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm31, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm30 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5],ymm0[6],ymm11[7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13],ymm0[14],ymm11[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm17[0,1,0,0] -; AVX512F-FAST-NEXT: vprolq $16, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,1,4,5,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3],ymm15[4],ymm9[5,6],ymm15[7],ymm9[8,9],ymm15[10],ymm9[11],ymm15[12],ymm9[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[3,0,3,0,7,4,7,4] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5],ymm9[6],ymm15[7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13],ymm9[14],ymm15[15] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10],ymm10[11],ymm7[12,13],ymm10[14],ymm7[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,2,3,6,7,6,7,8,9,4,5,4,5,8,9,18,19,18,19,22,23,22,23,24,25,20,21,20,21,24,25] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3,4],ymm10[5,6,7,8],ymm11[9],ymm10[10],ymm11[11,12],ymm10[13,14,15] +; AVX512F-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm18 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm31 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm24[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vprolq $16, %ymm4, %ymm15 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm3 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,1,1,2,5,5,5,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10,11],ymm15[12],ymm3[13],ymm15[14],ymm3[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,1,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm20 -; AVX512F-FAST-NEXT: vpandnq %ymm20, %ymm31, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[3,2,3,3,7,6,7,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3,4],ymm8[5,6,7,8],ymm12[9],ymm8[10],ymm12[11,12],ymm8[13,14,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm27[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm27[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3],ymm15[4],ymm2[5,6],ymm15[7],ymm2[8,9],ymm15[10],ymm2[11],ymm15[12],ymm2[13,14],ymm15[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,12,13,0,1,14,15,14,15,14,15,14,15,16,17,28,29,28,29,16,17,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5],ymm15[6],ymm1[7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13],ymm15[14],ymm1[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vpbroadcastq 16(%r8), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm12, %ymm15, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm23[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15] +; AVX512F-FAST-NEXT: vprolq $16, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,2,1,4,5,6,5] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3],ymm13[4],ymm0[5,6],ymm13[7],ymm0[8,9],ymm13[10],ymm0[11],ymm13[12],ymm0[13,14],ymm13[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[3,0,3,0,7,4,7,4] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm14[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm16[2,2,3,2] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[1,1,1,2,5,5,5,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,1,1,1] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm10 +; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm10 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm10, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vpbroadcastq 80(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpandn %ymm9, %ymm15, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm26[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm13 = mem[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm18 = mem[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm19 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm23 = mem[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm31[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm30[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm25[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm21[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm17[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm18 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm29 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm16, %zmm20 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm18, %zmm29 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm9 = mem[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm16 = mem[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm17 = mem[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm18 = mem[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm25[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm24[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm22[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm21[2,2,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm19[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, (%rsp), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm19 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm15 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm11 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm22, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm21, %zmm19, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm22, %zmm11 -; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm16 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm11, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm26, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm28, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm22, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm16, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm8 -; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm12, %zmm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm23, %zmm11, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm17, %zmm21 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm9, %zmm29, %zmm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm22, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm13, %zmm11 +; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm9 +; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm11, %zmm14, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm15 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm24, %zmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm17 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm11, %zmm13, %zmm17 +; AVX512F-FAST-NEXT: vpbroadcastq 88(%r8), %ymm11 +; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm17, %zmm14, %zmm11 +; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm14 +; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm15, %zmm17, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm16, %zmm3 +; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm4 +; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm20, %zmm13, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm13, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm27, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 384(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 448(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 576(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 320(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%r9) -; AVX512F-FAST-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $248, %zmm0, %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 384(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 576(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 512(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 320(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-FAST-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm4, %zmm2 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9 -; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm18, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm19, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44> +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm9, %zmm8 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm20, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm21, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm22, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm23, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u> +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm13 +; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm24, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vpermt2w %zmm15, %zmm26, %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u> +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm15, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm8 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm10 {%k3} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2w %zmm6, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2w %zmm4, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm4, %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm12, %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm18, %zmm4 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm19 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm1, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm11 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm22, %zmm11 +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm6, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm5, %zmm25 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm25 {%k3} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm25 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 @@ -6512,6 +6506,7 @@ ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} +; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -19,67 +19,67 @@ ; SSE-LABEL: store_i16_stride6_vf2: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: movdqa (%r9), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,4,6] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,3,3,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,0,3,4,5,6,7] -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[1,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movq %xmm1, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,7,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,5,7,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovq %xmm3, 16(%rax) ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf2: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] @@ -92,16 +92,18 @@ ; AVX2-FAST-LABEL: store_i16_stride6_vf2: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -113,16 +115,18 @@ ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf2: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -134,16 +138,18 @@ ; AVX512F-SLOW-LABEL: store_i16_stride6_vf2: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] @@ -156,16 +162,18 @@ ; AVX512F-FAST-LABEL: store_i16_stride6_vf2: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,2,3,6,7,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -177,13 +185,15 @@ ; AVX512BW-LABEL: store_i16_stride6_vf2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -227,12 +237,12 @@ ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,1,3] @@ -248,38 +258,41 @@ ; AVX1-ONLY-LABEL: store_i16_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,1,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -288,30 +301,33 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -320,30 +336,32 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -352,28 +370,30 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -382,32 +402,34 @@ ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5],ymm3[6],ymm4[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -416,31 +438,33 @@ ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1,2],xmm3[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,6,1,3,4,6,1,3] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -449,19 +473,18 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,32,40,1,5,9,13,33,41,2,6,10,14,34,42,3,7,11,15,35,43,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 @@ -485,85 +508,85 @@ ; SSE-LABEL: store_i16_stride6_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm0 ; SSE-NEXT: movdqa (%rcx), %xmm9 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE-NEXT: movdqa (%r8), %xmm4 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; SSE-NEXT: movdqa %xmm2, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm6[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm3[0,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm2, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm11, %xmm3 -; SSE-NEXT: orps %xmm10, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[0,1] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,1,2,1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: andnps %xmm11, %xmm5 +; SSE-NEXT: orps %xmm10, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,2],xmm9[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: andps %xmm2, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] -; SSE-NEXT: andnps %xmm9, %xmm2 -; SSE-NEXT: orps %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] +; SSE-NEXT: andps %xmm3, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: andnps %xmm9, %xmm3 +; SSE-NEXT: orps %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm8[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: andps %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] ; SSE-NEXT: movaps %xmm8, %xmm9 ; SSE-NEXT: andnps %xmm11, %xmm9 ; SSE-NEXT: orps %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm10[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,3,3] ; SSE-NEXT: movaps %xmm10, %xmm11 ; SSE-NEXT: andnps %xmm13, %xmm11 ; SSE-NEXT: orps %xmm12, %xmm11 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm7[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[0,2] -; SSE-NEXT: andps %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] -; SSE-NEXT: pslld $16, %xmm5 -; SSE-NEXT: andnps %xmm5, %xmm8 -; SSE-NEXT: orps %xmm4, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[0,2] ; SSE-NEXT: andps %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] ; SSE-NEXT: andnps %xmm6, %xmm10 ; SSE-NEXT: orps %xmm12, %xmm10 -; SSE-NEXT: movaps %xmm10, 16(%rax) +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: andnps %xmm1, %xmm8 +; SSE-NEXT: orps %xmm0, %xmm8 ; SSE-NEXT: movaps %xmm8, 48(%rax) +; SSE-NEXT: movaps %xmm10, 16(%rax) ; SSE-NEXT: movaps %xmm11, 64(%rax) ; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) ; SSE-NEXT: movaps %xmm3, 80(%rax) +; SSE-NEXT: movaps %xmm5, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf8: @@ -688,8 +711,10 @@ ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,3,7,2,6,3,7] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -745,9 +770,9 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] @@ -784,9 +809,9 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 @@ -814,8 +839,10 @@ ; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,3,7,2,6,3,7] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,6,7,u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,26,27,30,31,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,u,u,u,u,22,23,30,31] @@ -863,333 +890,325 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $24, %rsp -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa (%rcx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa 16(%rcx), %xmm14 +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm9[3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm2[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: andnps %xmm13, %xmm1 +; SSE-NEXT: orps %xmm12, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm5[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: andnps %xmm10, %xmm1 +; SSE-NEXT: orps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: movdqa %xmm13, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[3,3] +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm10[0,1] +; SSE-NEXT: movdqa 16(%r9), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: andps %xmm0, %xmm15 +; SSE-NEXT: orps %xmm15, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm9[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0,1,3] -; SSE-NEXT: movaps {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm7, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm6[3,3] -; SSE-NEXT: movdqa (%r8), %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm8[2,3] -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm13 -; SSE-NEXT: orps %xmm13, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm12[3,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: andnps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm11[3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm14[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] +; SSE-NEXT: andnps %xmm14, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0],xmm7[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[0,2] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm11, %xmm14 +; SSE-NEXT: andps %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[0,2] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: andps %xmm9, %xmm1 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm4[0] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm1[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm3 ; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: andps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: andnps %xmm3, %xmm7 -; SSE-NEXT: orps %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm11[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[0,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pslld $16, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm6[1,1] +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: andps %xmm15, %xmm11 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[0,2] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: andps %xmm9, %xmm8 +; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm13[0] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm13[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[0,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: andps %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: andps %xmm9, %xmm4 ; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: andps %xmm1, %xmm14 -; SSE-NEXT: por %xmm14, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,2,3,3] +; SSE-NEXT: pslld $16, %xmm10 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm4, %xmm15 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: pandn %xmm4, %xmm11 -; SSE-NEXT: andps %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] -; SSE-NEXT: andps %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,1,1] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: andps %xmm1, %xmm9 -; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,1],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[0,2] -; SSE-NEXT: andps %xmm1, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm11[0,2] +; SSE-NEXT: andps %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movdqa %xmm9, 160(%rax) +; SSE-NEXT: movdqa %xmm15, 144(%rax) +; SSE-NEXT: movdqa %xmm8, 112(%rax) +; SSE-NEXT: movdqa %xmm3, 96(%rax) +; SSE-NEXT: movdqa %xmm0, 64(%rax) ; SSE-NEXT: movdqa %xmm1, 48(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm3, 112(%rax) -; SSE-NEXT: movdqa %xmm11, 160(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) -; SSE-NEXT: movdqa %xmm8, 16(%rax) -; SSE-NEXT: movdqa %xmm5, 64(%rax) -; SSE-NEXT: movdqa %xmm13, 144(%rax) -; SSE-NEXT: movaps %xmm7, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm14, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 176(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: addq $24, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm4 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm13[2,3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm13[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vpslld $16, %xmm12, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1],xmm10[2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1,2],xmm15[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm3[0,1],xmm11[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2],xmm9[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0],xmm3[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm12[4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 +; AVX1-ONLY-NEXT: vpslld $16, %xmm10, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1,2],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm9[1],xmm2[2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0,1],xmm12[0],xmm1[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3,4,5,6],xmm8[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 128(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 144(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 @@ -1216,89 +1235,91 @@ ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm13 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm15, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm15, %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm15, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm7[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -1306,7 +1327,7 @@ ; ; AVX2-FAST-LABEL: store_i16_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 @@ -1317,103 +1338,105 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm11 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm12, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm8, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm8 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6],ymm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm3 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,4,0,6,5,4,0,6] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5,6],ymm8[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm8, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,3,2,1,0,3,2] ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper @@ -1421,202 +1444,204 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm13 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm7 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1,2],ymm14[3],ymm12[4,5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm2[0],ymm11[1],ymm2[1],ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[8],ymm2[8],ymm11[9],ymm2[9],ymm11[10],ymm2[10],ymm11[11],ymm2[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride6_vf16: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-SLOW-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,5,20,3,4,21,6,7,13,14,30,14,13,31,15,15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <5,u,14,6,u,15,7,u> -; AVX512F-SLOW-NEXT: vpermi2d %ymm7, %ymm8, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm7, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [8,21,10,11,22,13,14,23] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,8,u,1,9,u,2,10> -; AVX512F-SLOW-NEXT: vpermi2d %ymm9, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-SLOW-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,2,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,0,1,17,5,2,18,8,9,24,11,8,25,10,11] +; AVX512F-SLOW-NEXT: vpermi2d %zmm9, %zmm12, %zmm13 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[1,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[1,2,2,3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] +; AVX512F-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2],ymm13[3,4],ymm9[5],ymm13[6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,12,u,5,13,u,6,14> -; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,2,18,2,u,19,3,3,12,28,12,13,29,13,14,30> +; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,10,2,u,11,3,u> -; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11] ; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1628,80 +1653,78 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,8,u,1,9,u,2,10> -; AVX512F-FAST-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,10,2,u,11,3,u> -; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11] -; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,12,u,5,13,u,6,14> -; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm7 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] -; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,14,6,u,15,7,u> +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,5,20,3,4,21,6,7,13,14,30,14,13,31,15,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,2,13,4,5,14,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,21,10,11,22,13,14,23] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,0,1,17,5,2,18,8,9,24,11,8,25,10,11] +; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,3,4,9,6,7] +; AVX512F-FAST-NEXT: vpermi2d %ymm14, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,2,2,3] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm12, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,2,18,2,u,19,3,3,12,28,12,13,29,13,14,30> +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,9,2,3,10,5,6,11] ; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1917,7 +1940,7 @@ ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,0],xmm5[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm14 @@ -1929,8 +1952,8 @@ ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm3[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] @@ -1944,7 +1967,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm14[0,2] ; SSE-NEXT: movdqa %xmm2, %xmm0 @@ -1958,7 +1981,7 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm15[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm15[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm14, %xmm0 @@ -1971,7 +1994,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[0,2] ; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] @@ -1981,8 +2004,8 @@ ; SSE-NEXT: por %xmm6, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm1 @@ -1995,7 +2018,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm1[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm8[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm8 @@ -2005,7 +2028,7 @@ ; SSE-NEXT: por %xmm7, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: pandn %xmm14, %xmm7 @@ -2015,7 +2038,7 @@ ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0] ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm11[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm11[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movdqa %xmm0, %xmm14 @@ -2025,8 +2048,8 @@ ; SSE-NEXT: andps %xmm9, %xmm8 ; SSE-NEXT: por %xmm8, %xmm3 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm8 @@ -2037,7 +2060,7 @@ ; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm12[0] ; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,1],xmm12[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm11[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 @@ -2048,7 +2071,7 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm12 = xmm12[1],xmm1[1] ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm12[0,2] ; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: pandn %xmm15, %xmm12 @@ -2059,7 +2082,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: movdqa %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm13[0,2] ; SSE-NEXT: movdqa %xmm4, %xmm15 ; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5] @@ -2068,8 +2091,8 @@ ; SSE-NEXT: andps %xmm9, %xmm14 ; SSE-NEXT: por %xmm14, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm1[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; SSE-NEXT: movdqa %xmm5, %xmm14 @@ -2081,7 +2104,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[0,2] ; SSE-NEXT: andps %xmm9, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,3,3] @@ -2090,7 +2113,7 @@ ; SSE-NEXT: por %xmm1, %xmm9 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] ; SSE-NEXT: andps %xmm5, %xmm10 ; SSE-NEXT: pandn %xmm15, %xmm5 @@ -2139,264 +2162,268 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $120, %rsp -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: subq $88, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1],xmm14[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3],xmm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0],xmm13[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm12[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm11[0],xmm3[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm8[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm13[0,1],xmm1[0],xmm13[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4],xmm13[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0,1],xmm13[2,3,4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm13[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm12[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4],xmm0[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0],xmm5[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5],xmm3[6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4,5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm13[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6],xmm14[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3],xmm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm14, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm11[0,1],xmm3[0],xmm11[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0],xmm6[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm0 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm12[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0],xmm2[1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3,4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 368(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 320(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 336(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 288(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 304(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 272(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 224(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, 240(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2404,54 +2431,51 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) -; AVX1-ONLY-NEXT: addq $120, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: addq $88, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: subq $584, %rsp # imm = 0x248 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -2459,283 +2483,276 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[2,1,2,3,6,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = mem[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2],ymm12[3,4],ymm0[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm9, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm11 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = mem[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm15[4],ymm4[4],ymm15[5],ymm4[5],ymm15[6],ymm4[6],ymm15[7],ymm4[7],ymm15[12],ymm4[12],ymm15[13],ymm4[13],ymm15[14],ymm4[14],ymm15[15],ymm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm12, %ymm15 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm12 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm12 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[2],mem[2],ymm5[3],mem[3],ymm5[8],mem[8],ymm5[9],mem[9],ymm5[10],mem[10],ymm5[11],mem[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm11 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,0,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm11 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,0,2,2,5,4,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[2],mem[2],ymm3[3],mem[3],ymm3[8],mem[8],ymm3[9],mem[9],ymm3[10],mem[10],ymm3[11],mem[11] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,2,2,5,4,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm10, %ymm13, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm13, %ymm14, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm13, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm15 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3],ymm14[4],ymm0[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm14 = mem[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm15 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm8, %ymm12 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm12, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm13, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-SLOW-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] @@ -2745,7 +2762,7 @@ ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 @@ -2756,23 +2773,20 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2782,1472 +2796,1405 @@ ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,2,1,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm10, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm10, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[8],ymm4[8],ymm14[9],ymm4[9],ymm14[10],ymm4[10],ymm14[11],ymm4[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [5,4,2,2,5,4,6,6] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm10 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,1,1,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm10[2],ymm1[3,4],ymm10[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm14[4],mem[4],ymm14[5],mem[5],ymm14[6],mem[6],ymm14[7],mem[7],ymm14[12],mem[12],ymm14[13],mem[13],ymm14[14],mem[14],ymm14[15],mem[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6],ymm4[7] -; AVX2-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [5,4,2,2,5,4,6,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[1],ymm9[1],ymm5[2],ymm9[2],ymm5[3],ymm9[3],ymm5[8],ymm9[8],ymm5[9],ymm9[9],ymm5[10],ymm9[10],ymm5[11],ymm9[11] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: subq $584, %rsp # imm = 0x248 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm11 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm10, %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2],ymm14[3,4],ymm8[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = mem[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm11 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm8, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm14[0],ymm10[0],ymm14[1],ymm10[1],ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[8],ymm10[8],ymm14[9],ymm10[9],ymm14[10],ymm10[10],ymm14[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm14 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm14 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm14[4],ymm8[4],ymm14[5],ymm8[5],ymm14[6],ymm8[6],ymm14[7],ymm8[7],ymm14[12],ymm8[12],ymm14[13],ymm8[13],ymm14[14],ymm8[14],ymm14[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[4],mem[4],ymm11[5],mem[5],ymm11[6],mem[6],ymm11[7],mem[7],ymm11[12],mem[12],ymm11[13],mem[13],ymm11[14],mem[14],ymm11[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm15 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm11 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm15[0],ymm11[1],ymm15[1],ymm11[2],ymm15[2],ymm11[3],ymm15[3],ymm11[8],ymm15[8],ymm11[9],ymm15[9],ymm11[10],ymm15[10],ymm11[11],ymm15[11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm13 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm6, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm6, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = mem[0,0,2,1,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm8[0],ymm14[1],ymm8[1],ymm14[2],ymm8[2],ymm14[3],ymm8[3],ymm14[8],ymm8[8],ymm14[9],ymm8[9],ymm14[10],ymm8[10],ymm14[11],ymm8[11] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm9[4],mem[4],ymm9[5],mem[5],ymm9[6],mem[6],ymm9[7],mem[7],ymm9[12],mem[12],ymm9[13],mem[13],ymm9[14],mem[14],ymm9[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm5[4],mem[4],ymm5[5],mem[5],ymm5[6],mem[6],ymm5[7],mem[7],ymm5[12],mem[12],ymm5[13],mem[13],ymm5[14],mem[14],ymm5[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm12, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[12],ymm10[12],ymm7[13],ymm10[13],ymm7[14],ymm10[14],ymm7[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $584, %rsp # imm = 0x248 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm16, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[8],ymm15[8],ymm8[9],ymm15[9],ymm8[10],ymm15[10],ymm8[11],ymm15[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[12],ymm15[12],ymm8[13],ymm15[13],ymm8[14],ymm15[14],ymm8[15],ymm15[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[2],ymm8[2],ymm15[3],ymm8[3],ymm15[8],ymm8[8],ymm15[9],ymm8[9],ymm15[10],ymm8[10],ymm15[11],ymm8[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm0[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm16[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm10[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5,6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm31[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm5 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm8[4],ymm15[5],ymm8[5],ymm15[6],ymm8[6],ymm15[7],ymm8[7],ymm15[12],ymm8[12],ymm15[13],ymm8[13],ymm15[14],ymm8[14],ymm15[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm14[4],ymm10[4],ymm14[5],ymm10[5],ymm14[6],ymm10[6],ymm14[7],ymm10[7],ymm14[12],ymm10[12],ymm14[13],ymm10[13],ymm14[14],ymm10[14],ymm14[15],ymm10[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[8],ymm5[8],ymm8[9],ymm5[9],ymm8[10],ymm5[10],ymm8[11],ymm5[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm29[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm24[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm23[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm28, %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: pushq %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[8],ymm1[8],ymm12[9],ymm1[9],ymm12[10],ymm1[10],ymm12[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm31 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm0[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm0[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm20, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm25 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm8, %zmm25, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm16, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm1[4],ymm15[5],ymm1[5],ymm15[6],ymm1[6],ymm15[7],ymm1[7],ymm15[12],ymm1[12],ymm15[13],ymm1[13],ymm15[14],ymm1[14],ymm15[15],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm24, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[8],ymm1[8],ymm15[9],ymm1[9],ymm15[10],ymm1[10],ymm15[11],ymm1[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[8],ymm3[8],ymm13[9],ymm3[9],ymm13[10],ymm3[10],ymm13[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[12],ymm11[12],ymm0[13],ymm11[13],ymm0[14],ymm11[14],ymm0[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm11, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm16, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm31, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm13[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm5, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm31, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm2, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm29, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm23, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[2],ymm1[2],ymm9[3],ymm1[3],ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm9[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm14, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm7[4],ymm15[5],ymm7[5],ymm15[6],ymm7[6],ymm15[7],ymm7[7],ymm15[12],ymm7[12],ymm15[13],ymm7[13],ymm15[14],ymm7[14],ymm15[15],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm19, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm15, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm16, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm30, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm30, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm20, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm1[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm3[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm3, %ymm5, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm8[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm23, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm7, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm7, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm13 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[8],ymm13[8],ymm10[9],ymm13[9],ymm10[10],ymm13[10],ymm10[11],ymm13[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm25[0,1,2,3],zmm22[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm18, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm15[0,1,2,3],zmm19[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm26, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm18, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm14, %zmm15, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm16[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm7, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm10[0,1,2,3],zmm2[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: popq %rax +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm29, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512DQ-SLOW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm16, %zmm17 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[8],ymm15[8],ymm8[9],ymm15[9],ymm8[10],ymm15[10],ymm8[11],ymm15[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm6, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6],ymm11[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[12],ymm15[12],ymm8[13],ymm15[13],ymm8[14],ymm15[14],ymm8[15],ymm15[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm8, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm10[2],ymm0[3,4],ymm10[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm8[0],ymm15[1],ymm8[1],ymm15[2],ymm8[2],ymm15[3],ymm8[3],ymm15[8],ymm8[8],ymm15[9],ymm8[9],ymm15[10],ymm8[10],ymm15[11],ymm8[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm3, %xmm25 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm16, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm7[0,1,2,3],zmm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm9, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm9[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm8[4],ymm15[5],ymm8[5],ymm15[6],ymm8[6],ymm15[7],ymm8[7],ymm15[12],ymm8[12],ymm15[13],ymm8[13],ymm15[14],ymm8[14],ymm15[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm14[4],ymm9[4],ymm14[5],ymm9[5],ymm14[6],ymm9[6],ymm14[7],ymm9[7],ymm14[12],ymm9[12],ymm14[13],ymm9[13],ymm14[14],ymm9[14],ymm14[15],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm4, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm28[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm27[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm5, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm25, %xmm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm1, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $40, %rsp -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm3, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm21, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm9, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm15[4],ymm5[4],ymm15[5],ymm5[5],ymm15[6],ymm5[6],ymm15[7],ymm5[7],ymm15[12],ymm5[12],ymm15[13],ymm5[13],ymm15[14],ymm5[14],ymm15[15],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm13[4],ymm4[5],ymm13[5],ymm4[6],ymm13[6],ymm4[7],ymm13[7],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[1],ymm2[1],ymm10[2],ymm2[2],ymm10[3],ymm2[3],ymm10[8],ymm2[8],ymm10[9],ymm2[9],ymm10[10],ymm2[10],ymm10[11],ymm2[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm23, %zmm21 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm9, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm24 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm0[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm26, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm25, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm0 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm4, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm7 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm26, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm14, %ymm28 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm4, %zmm30 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm23 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm24, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[8],ymm14[8],ymm4[9],ymm14[9],ymm4[10],ymm14[10],ymm4[11],ymm14[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm0[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm12 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm19, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm26, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm8, %zmm1, %zmm27 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm15[0],ymm5[0],ymm15[1],ymm5[1],ymm15[2],ymm5[2],ymm15[3],ymm5[3],ymm15[8],ymm5[8],ymm15[9],ymm5[9],ymm15[10],ymm5[10],ymm15[11],ymm5[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd %zmm24, %zmm8, %zmm15 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm8, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm12, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm20[0],zero,xmm20[1],zero,xmm20[2],zero,xmm20[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX512DQ-FAST-NEXT: vpermi2d %ymm0, %ymm15, %ymm19 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm4[4],ymm14[4],ymm4[5],ymm14[5],ymm4[6],ymm14[6],ymm4[7],ymm14[7],ymm4[12],ymm14[12],ymm4[13],ymm14[13],ymm4[14],ymm14[14],ymm4[15],ymm14[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm13, %ymm20, %ymm13 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm14[0],ymm4[1],ymm14[1],ymm4[2],ymm14[2],ymm4[3],ymm14[3],ymm4[8],ymm14[8],ymm4[9],ymm14[9],ymm4[10],ymm14[10],ymm4[11],ymm14[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm12[4],ymm0[4],ymm12[5],ymm0[5],ymm12[6],ymm0[6],ymm12[7],ymm0[7],ymm12[12],ymm0[12],ymm12[13],ymm0[13],ymm12[14],ymm0[14],ymm12[15],ymm0[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm13, %zmm4, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm13, %ymm1, %ymm24 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd %zmm26, %zmm31, %zmm14 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm5, %zmm14 {%k2} ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm14, %ymm24, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm13, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm14 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[2],mem[2],ymm13[3],mem[3],ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm12, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm22[0,1,2,3],zmm20[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm23[0,1,2,3],zmm21[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm16 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm14, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,2,3],zmm25[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm27[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm29, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm9, %ymm16, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm14 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm11, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm6[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[2],ymm0[2],ymm12[3],ymm0[3],ymm12[8],ymm0[8],ymm12[9],ymm0[9],ymm12[10],ymm0[10],ymm12[11],ymm0[11] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpermd %zmm27, %zmm31, %zmm26 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2d %ymm8, %ymm26, %ymm16 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm26 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm0 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512DQ-FAST-NEXT: addq $40, %rsp +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -4686,7 +4633,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm1[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4699,8 +4646,8 @@ ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,1,1] @@ -4714,7 +4661,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] ; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm13[0,2] ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -4728,7 +4675,7 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm13, %xmm0 @@ -4740,7 +4687,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4752,8 +4699,8 @@ ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm0 @@ -4766,7 +4713,7 @@ ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[0,2] ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -4780,7 +4727,7 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm0 @@ -4792,7 +4739,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4804,8 +4751,8 @@ ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm9[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm1 @@ -4818,7 +4765,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm0 @@ -4829,7 +4776,7 @@ ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm3, %xmm0 @@ -4841,7 +4788,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4853,8 +4800,8 @@ ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm3 @@ -4867,7 +4814,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 @@ -4878,7 +4825,7 @@ ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 @@ -4890,7 +4837,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -4902,8 +4849,8 @@ ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm3 @@ -4916,7 +4863,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm1 @@ -4927,7 +4874,7 @@ ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 @@ -4940,7 +4887,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4952,8 +4899,8 @@ ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] ; SSE-NEXT: movdqa %xmm12, %xmm4 @@ -4966,7 +4913,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] ; SSE-NEXT: pslld $16, %xmm3 @@ -4978,7 +4925,7 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 @@ -4990,7 +4937,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -5001,8 +4948,8 @@ ; SSE-NEXT: andps %xmm8, %xmm0 ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm4 @@ -5015,7 +4962,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] ; SSE-NEXT: movdqa %xmm4, %xmm1 @@ -5027,7 +4974,7 @@ ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm3[0,2] ; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm5, %xmm4 @@ -5039,7 +4986,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[0,2] ; SSE-NEXT: movdqa %xmm15, %xmm5 @@ -5050,9 +4997,9 @@ ; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] ; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] ; SSE-NEXT: movdqa %xmm15, %xmm9 @@ -5065,7 +5012,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2] ; SSE-NEXT: andps %xmm8, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,2,3,3] @@ -5074,7 +5021,7 @@ ; SSE-NEXT: por %xmm2, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,2] ; SSE-NEXT: andps %xmm12, %xmm7 ; SSE-NEXT: pandn %xmm5, %xmm12 @@ -5172,558 +5119,554 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 -; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] +; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = xmm3[0,1],xmm6[0],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5],xmm14[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm14[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm9[1,2],xmm11[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm7, %xmm11 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0],xmm9[1],xmm8[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6],ymm7[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm0[0],xmm7[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3],xmm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1,2],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm0[4,5],xmm5[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5],xmm5[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm6[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm5[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm2[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm6[1,2],xmm3[3] -; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm5[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm8[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm1[1,2],xmm2[3] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 80(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0,1],xmm1[0],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm15[5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1,2],xmm15[3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm3[1,2],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6],xmm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4],xmm5[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm7[0,1],xmm0[0],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm14[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm2[1,2],xmm3[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm5[1],xmm4[2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0],xmm0[1],xmm12[2,3] -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3,4,5],xmm15[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm11[0,1],xmm3[0],xmm11[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm15[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm6[0,1],xmm1[0],xmm6[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm3[1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, 96(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 64(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 752(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 736(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 720(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 704(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 688(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 672(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 656(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 640(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm11, 624(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 608(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 624(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 608(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 672(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 752(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 720(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5733,38 +5676,38 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%rax) -; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) +; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6247,7 +6190,7 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] @@ -6475,7 +6418,7 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] @@ -6496,8 +6439,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 @@ -6805,12 +6747,12 @@ ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] @@ -7198,7 +7140,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm7, %ymm7 @@ -7215,8 +7157,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] @@ -7366,12 +7307,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] @@ -7477,2095 +7418,1974 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $408, %rsp # imm = 0x198 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: subq $200, %rsp ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm6[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6],ymm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm4[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm4[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm10, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm12 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[8],ymm8[8],ymm12[9],ymm8[9],ymm12[10],ymm8[10],ymm12[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm7[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm3[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm3[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[8],ymm10[8],ymm12[9],ymm10[9],ymm12[10],ymm10[10],ymm12[11],ymm10[11] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm10, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5,6],ymm11[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,0,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6],ymm10[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm3 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm4[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm9, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm6, %zmm10 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm10[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm1[1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm4[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[1,0,2,2,5,4,6,6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm14[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm11 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm11[0,1,2,3],zmm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,0,2,2,5,4,6,6] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm12[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6],ymm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm14 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm14[0],ymm11[0],ymm14[1],ymm11[1],ymm14[2],ymm11[2],ymm14[3],ymm11[3],ymm14[8],ymm11[8],ymm14[9],ymm11[9],ymm14[10],ymm11[10],ymm14[11],ymm11[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, (%rsp), %zmm26, %zmm25 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm0 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm8[4],ymm2[5],ymm8[5],ymm2[6],ymm8[6],ymm2[7],ymm8[7],ymm2[12],ymm8[12],ymm2[13],ymm8[13],ymm2[14],ymm8[14],ymm2[15],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[2,1,2,3,6,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,2,3,3,5,6,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[2,1,2,3,6,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,1,2,3,6,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6],ymm0[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm25, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,0,2,2] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3],ymm5[4,5],ymm12[6],ymm5[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5,6],ymm15[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm15, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm28 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm31, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm19, %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm25, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm25, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm15 # 64-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm18, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm24 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm26, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm16, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm29, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm22, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm26, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm22, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm10, %zmm26, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm29, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1064, %rsp # imm = 0x428 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: subq $536, %rsp # imm = 0x218 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [1,0,2,2,1,0,2,2] +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm28, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm29 +; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm29 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm25, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-ONLY-FAST-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm18 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,20,11,12,21,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,8,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm8, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm22, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm16 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm8, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [5,6,5,6,5,6,7,7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[12],ymm11[12],ymm2[13],ymm11[13],ymm2[14],ymm11[14],ymm2[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm24, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm9, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm2, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm14 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm20, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm10, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [1,0,2,2,1,0,2,2] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm30 -; AVX512F-ONLY-FAST-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm30 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [16,9,10,17,12,13,18,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm10, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm28, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm18, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm28, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm24 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm19, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm25, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm28, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm3 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,8,3,4,9,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm11, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm14, %ymm11, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm1, %ymm11, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm28, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm14 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm18[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm9, %ymm14, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm21[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm26[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm9, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm23, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm9, %zmm16, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,9,2,3,8,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm25, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [8,9,20,11,12,21,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm19, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm15[0],ymm4[1],ymm15[1],ymm4[2],ymm15[2],ymm4[3],ymm15[3],ymm4[8],ymm15[8],ymm4[9],ymm15[9],ymm4[10],ymm15[10],ymm4[11],ymm15[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm27, %zmm22, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm1, %zmm27 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,1,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm3 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm24, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm8 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm2, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm15[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm12 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[12],ymm0[12],ymm8[13],ymm0[13],ymm8[14],ymm0[14],ymm8[15],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm12, %zmm2 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm0, %ymm4, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm8, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm8[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm29, %zmm7, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm7, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,5,6,5,6,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm27[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm24[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm26[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [12,1,2,13,4,5,14,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm6, %ymm27, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,21,10,11,20,13,14,23] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm6, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[8],ymm14[8],ymm10[9],ymm14[9],ymm10[10],ymm14[10],ymm10[11],ymm14[11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm15[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm12[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %ymm6, %ymm5, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm12[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm7, %ymm19, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm12 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm18[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm13 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm13 = zmm16[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm27[0,1,2,3],zmm21[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm13, %zmm18, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm26, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = zmm25[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm14[0,1,2,3],zmm29[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm14 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm24[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm18 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm12, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm22 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm26 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,3,4,9,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm18, %ymm2, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm22, %ymm2, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm26, %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm27, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = zmm30[0,1,2,3],mem[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm18, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm23[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm26, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm24, %zmm29, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm20, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm27, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm13, %zmm5 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1064, %rsp # imm = 0x428 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $536, %rsp # imm = 0x218 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $408, %rsp # imm = 0x198 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: subq $72, %rsp +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm6, %zmm3, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm6[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512DQ-SLOW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm3, %xmm12 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %xmm19 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm5, %ymm23 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm3[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm3[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm6[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm6 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm17, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm11, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm10, %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm18, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6],ymm7[7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm17, %zmm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm22 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm7[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm7[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm10 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm8 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7] +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm7 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm5 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,1] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,0,2,2] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm8 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm7, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm7 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,0,2,2,5,4,6,6] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rsi), %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm10, %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm3 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2,3],ymm1[4],ymm9[5,6],ymm1[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, (%rsp), %zmm26, %zmm24 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,1,2,3,6,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,1,2,3,6,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm10[0],ymm2[1],ymm10[1],ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[8],ymm10[8],ymm2[9],ymm10[9],ymm2[10],ymm10[10],ymm2[11],ymm10[11] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm8, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[12],ymm12[12],ymm8[13],ymm12[13],ymm8[14],ymm12[14],ymm8[15],ymm12[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm10 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[12],ymm7[12],ymm4[13],ymm7[13],ymm4[14],ymm7[14],ymm4[15],ymm7[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm11, %zmm10, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm3, %ymm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm14, %ymm16 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm5[0,1,2,3],zmm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %ymm5 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm10[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm11 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-SLOW-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm2, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm9[1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm6, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm0[0,1,2,3],zmm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm27 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm14 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm14, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm11, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm14, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm11[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[8],ymm7[8],ymm4[9],ymm7[9],ymm4[10],ymm7[10],ymm4[11],ymm7[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[8],ymm12[8],ymm8[9],ymm12[9],ymm8[10],ymm12[10],ymm8[11],ymm12[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm2 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[1,0,2,2,5,4,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm12 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm15, %zmm14, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm13 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm12[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,2,3,3,5,6,7,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %xmm4 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufb %ymm12, %ymm3, %ymm12 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm1 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm13 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm8, %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm8 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} ymm9 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,2,3,3,5,6,7,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm9, %zmm8, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshufb %ymm14, %ymm9, %ymm1 -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm8 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm9 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm11[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm4, %ymm24 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm7 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm5 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm5, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm7[1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,1,2,1] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,5] -; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm12 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpsrldq {{.*#+}} xmm15 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm23, %zmm23 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r9), %ymm5 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm5[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,0,2,2] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm13, %zmm7 {%k2} -; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] -; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm23, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm1 # 64-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm18 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm14, %zmm18 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm26, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm15 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm14, %zmm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm23 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm26, %zmm28 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm27, %zmm16 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm17, %zmm7 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 512(%rax) +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm26, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm27, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: addq $408, %rsp # imm = 0x198 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-SLOW-NEXT: addq $72, %rsp ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $920, %rsp # imm = 0x398 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm28 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[8],ymm13[8],ymm2[9],ymm13[9],ymm2[10],ymm13[10],ymm2[11],ymm13[11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm18 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [5,6,5,6,5,6,7,7] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: subq $1080, %rsp # imm = 0x438 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[8],ymm15[8],ymm8[9],ymm15[9],ymm8[10],ymm15[10],ymm8[11],ymm15[11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm8 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [1,0,2,2,1,0,2,2] +; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm31, %ymm10 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm19 +; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm10, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm10 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,6,7,4,5,10,11,8,9,10,11,12,13,14,15,24,25,22,23,20,21,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[12],ymm6[12],ymm3[13],ymm6[13],ymm3[14],ymm6[14],ymm3[15],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm24 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm6, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm20, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm19, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm3, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm17, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm10 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm29 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm31 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,0,2,2,1,0,2,2] -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm29, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 ; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm25 -; AVX512DQ-FAST-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm9, %zmm25 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm6, %zmm16, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm11 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm28 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm3, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm2 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[12],ymm13[12],ymm2[13],ymm13[13],ymm2[14],ymm13[14],ymm2[15],ymm13[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm17, %ymm5 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm6 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm0[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm1[4],ymm8[5],ymm1[5],ymm8[6],ymm1[6],ymm8[7],ymm1[7],ymm8[12],ymm1[12],ymm8[13],ymm1[13],ymm8[14],ymm1[14],ymm8[15],ymm1[15] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm17, %ymm6 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm15 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[8],ymm7[8],ymm15[9],ymm7[9],ymm15[10],ymm7[10],ymm15[11],ymm7[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm6, %zmm7, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm5, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm5, %ymm19, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm5[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm9 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm7, %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm17 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm12, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm16, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm7 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm5, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm29, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm27, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm12 -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm10 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm8, %zmm12, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[2,1,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm16 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm1[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm5 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm7 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm3, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm11[4],ymm6[5],ymm11[5],ymm6[6],ymm11[6],ymm6[7],ymm11[7],ymm6[12],ymm11[12],ymm6[13],ymm11[13],ymm6[14],ymm11[14],ymm6[15],ymm11[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512DQ-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm7, %zmm8, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm29, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa %ymm14, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm4 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpermd %ymm8, %ymm31, %ymm8 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm31 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm8 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm21[0],zero,xmm21[1],zero,xmm21[2],zero,xmm21[3],zero +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm0, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512DQ-FAST-NEXT: vpermi2d %ymm1, %ymm31, %ymm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm1, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm1[4],ymm15[5],ymm1[5],ymm15[6],ymm1[6],ymm15[7],ymm1[7],ymm15[12],ymm1[12],ymm15[13],ymm1[13],ymm15[14],ymm1[14],ymm15[15],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm12 +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpsrldq {{.*#+}} ymm14 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[8],ymm1[8],ymm14[9],ymm1[9],ymm14[10],ymm1[10],ymm14[11],ymm1[11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm12, %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermi2d %ymm1, %ymm4, %ymm29 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm9, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512DQ-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,1,1,1] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm21[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[0,1,2,3],zmm29[0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm27, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm31, %zmm24 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm24 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm0[0,1,2,3],zmm20[0,1,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm2, %ymm14, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm15[0],mem[0],ymm15[1],mem[1],ymm15[2],mem[2],ymm15[3],mem[3],ymm15[8],mem[8],ymm15[9],mem[9],ymm15[10],mem[10],ymm15[11],mem[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[8],ymm11[8],ymm6[9],ymm11[9],ymm6[10],ymm11[10],ymm6[11],ymm11[11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm6[2,2,2,3] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm25 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm11 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm30[0],zero,xmm30[1],zero,xmm30[2],zero,xmm30[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm17 -; AVX512DQ-FAST-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero -; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm13, %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm14, %zmm9, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2d %ymm12, %ymm20, %ymm13 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm25[0,1,2,3],zmm26[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm4, %ymm20, %ymm9 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm6 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm6, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm25, %zmm27 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm11[0,1,2,3],zmm28[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm11 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm20 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm19, %zmm22, %zmm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm30, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm28 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm19, %zmm22, %zmm28 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm16, %zmm22, %zmm18 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm23, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm16 +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm15, %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm11, %zmm8, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm8, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm11, %ymm20, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm3, %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm20, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm3[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm6[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm8, %ymm14, %ymm30 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm22, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vpermt2d %ymm10, %ymm14, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-FAST-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm11 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vpermi2d %ymm11, %ymm7, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm13, %ymm26 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm11[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm12[2,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm15 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm15[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm15 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm4[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm22, %zmm25, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,2,3],zmm18[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm26, %zmm18 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm16[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm25, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm13[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm28, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm15[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm29, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm7[0,1,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm19, %zmm22, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm23 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm19, %zmm22, %zmm23 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm26, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm22, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm5 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512DQ-FAST-NEXT: addq $920, %rsp # imm = 0x398 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-FAST-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34,0,0,0,32,3,35,0,0,1,33,4,36,0,0,2,34] +; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm8, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm10 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37,0,32,3,35,0,0,1,33,4,36,0,0,2,34,5,37] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] +; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm7, %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm24, %zmm17, %zmm19 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm12 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50,0,0,16,48,19,51,0,0,17,49,20,52,0,0,18,50] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm21, %zmm23 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53,16,48,19,51,0,0,17,49,20,52,0,0,18,50,21,53] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm18, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58,21,53,24,56,0,0,22,54,25,57,0,0,23,55,26,58] ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm24, %zmm22, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm0 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0,8,40,0,0,6,38,9,41,0,0,7,39,10,42,0,0] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0,13,45,0,0,11,43,14,46,0,0,12,44,15,47,0,0] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0,24,56,0,0,22,54,25,57,0,0,23,55,26,58,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm17 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm18 -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm24, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42,5,37,8,40,0,0,6,38,9,41,0,0,7,39,10,42] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm25 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm14 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm21 -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm1, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm1 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm4, %zmm17 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm4, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm24, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61,0,0,27,59,30,62,0,0,28,60,31,63,0,0,29,61] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm25, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm1, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm1, %zmm13 +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2w %zmm9, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm25, %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm9 ; AVX512BW-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm2 ; AVX512BW-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm15, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm19, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm15, %zmm16 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm23, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm20 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm20 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm7 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm15, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm19, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm23, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm7, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm10, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm12, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm6 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm11, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm5, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm3 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm5, %zmm22 +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm1 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -20,83 +20,85 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,7,7] -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, 16(%rax) -; SSE-NEXT: movdqa %xmm4, (%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movd %xmm0, 24(%rax) +; SSE-NEXT: movdqa (%rdi), %xmm4 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm5 +; SSE-NEXT: movdqa (%r9), %xmm1 +; SSE-NEXT: movdqa (%r10), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: andps %xmm4, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE-NEXT: andnps %xmm5, %xmm4 +; SSE-NEXT: orps %xmm6, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,1,3] +; SSE-NEXT: movd %xmm1, 24(%rax) +; SSE-NEXT: movq %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf2: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,2,3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,2,3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,10,11,14,15,u,u,u,u,u,u,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,3,6,7,10,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) -; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,10,11,14,15,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,6,7,10,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpextrd $2, %xmm0, 24(%rax) +; AVX1-ONLY-NEXT: vmovq %xmm1, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i16_stride7_vf2: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21] ; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-ONLY-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rax) ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax) ; AVX2-ONLY-NEXT: vzeroupper @@ -106,22 +108,24 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512F-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrd $2, %xmm1, 24(%rax) ; AVX512F-NEXT: vmovq %xmm1, 16(%rax) ; AVX512F-NEXT: vmovdqa %xmm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -131,14 +135,17 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u> ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -172,77 +179,78 @@ ; SSE-LABEL: store_i16_stride7_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm7 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm6, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm2[0] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm8 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm2[0] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: por %xmm7, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: psrld $16, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: psrld $16, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] +; SSE-NEXT: pand %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm9, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[0,1,2,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm11[0] +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; SSE-NEXT: psrlq $48, %xmm5 ; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 -; SSE-NEXT: psrlq $48, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,1] ; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] ; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: andnps %xmm3, %xmm2 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, (%rax) -; SSE-NEXT: movq %xmm5, 48(%rax) -; SSE-NEXT: movdqa %xmm6, 32(%rax) +; SSE-NEXT: movq %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm9, 32(%rax) ; SSE-NEXT: movdqa %xmm7, 16(%rax) ; SSE-NEXT: retq ; @@ -250,51 +258,60 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,0,1,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm9[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,u,u,u,u,u,u,u,u,u,u,4,5,12,13] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm4, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[14,15,4,5,8,9,u,u,8,9,14,15,12,13,14,15] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf4: @@ -305,39 +322,42 @@ ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-SLOW-NEXT: movq (%r10), %rcx +; AVX2-SLOW-NEXT: vmovq %rcx, %xmm6 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovq %xmm1, 48(%rax) -; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovd %ecx, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-SLOW-NEXT: vmovq %xmm0, 48(%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -349,37 +369,42 @@ ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-NEXT: movq (%r10), %rcx +; AVX2-FAST-NEXT: vmovq %rcx, %xmm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,1,5,0,1,1,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovd %ecx, %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FAST-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -391,38 +416,41 @@ ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: movq (%r10), %rcx +; AVX2-FAST-PERLANE-NEXT: vmovq %rcx, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm4[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,4,5,14,15,u,u,u,u,16,17,18,19,20,21,22,23,20,21,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovd %ecx, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 48(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -438,32 +466,33 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[6,7],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512F-SLOW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -479,29 +508,30 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[18,19,22,23,26,27],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX512F-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] -; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,8,9],zero,zero,zero,zero,ymm0[u,u,u,u,u,u,2,3],zero,zero,ymm0[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <4,6,u,u,u,2,1,3> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,u,u,u,u,u,u],zero,zero,ymm3[22,23],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,24,25,28,29] +; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <5,7,1,3,7,u,u,u> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm0[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovq %xmm1, 48(%rax) @@ -521,16 +551,16 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u> -; AVX512BW-SLOW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax) +; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512BW-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -546,12 +576,12 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u> ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) @@ -584,147 +614,151 @@ ; SSE-LABEL: store_i16_stride7_vf8: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,2],xmm0[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm5 +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm6 +; SSE-NEXT: movdqa (%rax), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm11, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm11 +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm11 +; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: pand %xmm3, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm12, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 ; SSE-NEXT: por %xmm13, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm12[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pandn %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2] -; SSE-NEXT: andps %xmm10, %xmm14 -; SSE-NEXT: andnps %xmm13, %xmm10 -; SSE-NEXT: orps %xmm14, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: psrld $16, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm12[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2] +; SSE-NEXT: andps %xmm9, %xmm12 +; SSE-NEXT: andnps %xmm10, %xmm9 +; SSE-NEXT: orps %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm12[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm13, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; SSE-NEXT: por %xmm15, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; SSE-NEXT: por %xmm12, %xmm15 -; SSE-NEXT: psrlq $48, %xmm11 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm12 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm11 -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3] -; SSE-NEXT: pandn %xmm11, %xmm9 -; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, %xmm13 -; SSE-NEXT: psrld $16, %xmm13 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] +; SSE-NEXT: pand %xmm14, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: por %xmm15, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm14[0,3] ; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: por %xmm13, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm3[1,1] -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1] -; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: andps %xmm4, %xmm6 -; SSE-NEXT: andnps %xmm0, %xmm4 -; SSE-NEXT: orps %xmm6, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm12[2,1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: psrld $16, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[1,1] +; SSE-NEXT: pandn %xmm2, %xmm15 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: psrld $16, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE-NEXT: pandn %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm12, %xmm7 +; SSE-NEXT: por %xmm11, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,1] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: andps %xmm2, %xmm5 +; SSE-NEXT: andnps %xmm7, %xmm2 +; SSE-NEXT: orps %xmm5, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm14[2,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: andps %xmm1, %xmm5 +; SSE-NEXT: andps %xmm1, %xmm13 ; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm5, %xmm1 +; SSE-NEXT: orps %xmm13, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: movaps %xmm4, 64(%rax) -; SSE-NEXT: movdqa %xmm15, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] -; SSE-NEXT: movaps %xmm7, 80(%rax) -; SSE-NEXT: movdqa %xmm9, 96(%rax) +; SSE-NEXT: movaps %xmm2, 64(%rax) +; SSE-NEXT: movdqa %xmm4, 96(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 80(%rax) +; SSE-NEXT: movaps %xmm9, 48(%rax) +; SSE-NEXT: movdqa %xmm15, 32(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf8: @@ -833,10 +867,10 @@ ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] @@ -844,36 +878,36 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,2,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[28,29,20,21] ; AVX2-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] -; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[20,21,28,29],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -885,8 +919,8 @@ ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -904,42 +938,42 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,5,u,u,5,2,6,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm8[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,2,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[28,29,20,21] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17,24,25],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,5,u,u,5,2,6,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,24,25] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FAST-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r10), %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm9[1,3,3,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-FAST-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -951,9 +985,9 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -985,29 +1019,29 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,2,0] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[28,29,20,21] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm9[1,3,3,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1019,8 +1053,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -1035,12 +1069,12 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] @@ -1049,11 +1083,11 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,2,0] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 ; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,0,2] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1074,10 +1108,10 @@ ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,3,1] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 @@ -1099,9 +1133,9 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm8 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1112,30 +1146,30 @@ ; AVX512F-FAST-NEXT: vpbroadcastd 12(%r10), %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm8[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,3,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogd $206, 8(%r10){1to8}, %ymm2, %ymm3 ; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm8[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,2,1,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,2,2,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,5,u,u,5,2,6,u> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vporq %zmm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 @@ -1157,17 +1191,17 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1194,611 +1228,632 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $232, %rsp +; SSE-NEXT: subq $216, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm10 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa 16(%rax), %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm15 +; SSE-NEXT: movdqa 16(%r8), %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm8 +; SSE-NEXT: movdqa 16(%rax), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: andnps %xmm0, %xmm2 +; SSE-NEXT: orps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa (%r9), %xmm13 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: andnps %xmm3, %xmm5 -; SSE-NEXT: orps %xmm2, %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa (%rcx), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] ; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,0],xmm5[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm11[0,2] +; SSE-NEXT: andps %xmm12, %xmm4 +; SSE-NEXT: orps %xmm7, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm5[0,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: psrld $16, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm4, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm9[0,2] -; SSE-NEXT: andps %xmm7, %xmm12 -; SSE-NEXT: orps %xmm8, %xmm12 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[2,2,2,2] -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] -; SSE-NEXT: andps %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: andnps %xmm10, %xmm1 +; SSE-NEXT: orps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; SSE-NEXT: psrlq $48, %xmm12 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4] +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE-NEXT: movdqa %xmm13, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,6,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: por %xmm5, %xmm11 -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm10[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm5[2,0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,5,6,6,7] -; SSE-NEXT: andps %xmm15, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: andnps %xmm9, %xmm15 -; SSE-NEXT: orps %xmm4, %xmm15 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[2,3,2,3] -; SSE-NEXT: andnps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,1,2,3] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE-NEXT: psrld $16, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm13[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: andnps %xmm1, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: orps %xmm5, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrld $16, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: por %xmm11, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,2],xmm13[1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: andnps %xmm15, %xmm2 -; SSE-NEXT: orps %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,2],xmm4[1,1] +; SSE-NEXT: andnps %xmm11, %xmm8 +; SSE-NEXT: orps %xmm1, %xmm8 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm7[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,2],xmm1[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,6,7] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: andnps %xmm2, %xmm0 +; SSE-NEXT: orps %xmm11, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; SSE-NEXT: andnps %xmm2, %xmm11 +; SSE-NEXT: orps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm13 -; SSE-NEXT: orps %xmm15, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movaps %xmm10, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[0,1] -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] -; SSE-NEXT: andps %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: por %xmm7, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm9[0] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: movaps %xmm13, %xmm6 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[2,0],mem[0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,1] +; SSE-NEXT: andps %xmm2, %xmm6 +; SSE-NEXT: orps %xmm11, %xmm6 +; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm5 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,1] +; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: pandn %xmm10, %xmm2 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[0],mem[0] ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: # xmm14 = xmm14[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: andps %xmm5, %xmm14 +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: andps %xmm1, %xmm14 ; SSE-NEXT: por %xmm14, %xmm7 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: andps %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: por %xmm4, %xmm5 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: andps %xmm1, %xmm15 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm5, 112(%rax) +; SSE-NEXT: movdqa %xmm1, 112(%rax) ; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: movdqa %xmm0, 176(%rax) -; SSE-NEXT: movaps %xmm13, 64(%rax) -; SSE-NEXT: movaps %xmm2, 32(%rax) +; SSE-NEXT: movdqa %xmm2, 176(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) +; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, 160(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm9, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movdqa %xmm11, 96(%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movdqa %xmm12, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $232, %rsp +; SSE-NEXT: addq $216, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $40, %rsp +; AVX1-ONLY-NEXT: subq $120, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm9[1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1],xmm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[2,2,2,2] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5],xmm7[6],xmm3[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm12 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm4[0,2],xmm12[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm8, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm14[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm3[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm12[6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5],xmm2[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm11 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm11[1],xmm4[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $40, %rsp +; AVX1-ONLY-NEXT: addq $120, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax +; AVX2-SLOW-NEXT: subq $72, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 @@ -1807,188 +1862,169 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3,4],xmm7[5],xmm1[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,7,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm12, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: popq %rax +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $72, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: pushq %rax +; AVX2-FAST-NEXT: subq $40, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] @@ -2001,8 +2037,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] @@ -2010,82 +2045,105 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm12, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -2095,210 +2153,217 @@ ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 64(%rax) -; AVX2-FAST-NEXT: popq %rax +; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, (%rax) +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3,4],xmm1[5],xmm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm5[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2310,142 +2375,140 @@ ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vprold $16, %xmm8, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3],xmm9[4],xmm13[5,6],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpbroadcastd (%r10), %ymm7 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm16 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm9, %ymm15, %ymm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm15 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm12, %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm18 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm16, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[16,17,u,u] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd (%r10), %ymm8 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm10 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[16,17,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm11, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[16,17,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7,8,9,10],ymm0[11],ymm10[12,13],ymm0[14],ymm10[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm17[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8,9,10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -2459,126 +2522,124 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm15 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm15 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[16,17,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vpbroadcastd 8(%r10), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm6, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[16,17,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-FAST-NEXT: vprold $16, %xmm10, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,2] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm15 +; AVX512F-FAST-NEXT: vpbroadcastd (%r10), %ymm6 +; AVX512F-FAST-NEXT: vpbroadcastd 4(%r10), %ymm16 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX512F-FAST-NEXT: vprold $16, %xmm9, %xmm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3,4],xmm15[5],xmm0[6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512F-FAST-NEXT: vpor %ymm14, %ymm15, %ymm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[u,u,u,u,u,u],zero,zero,ymm4[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[16,17,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm13, %ymm14, %ymm13 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[16,17,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,14,15],zero,zero,ymm0[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm0[u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastd (%r10), %ymm9 -; AVX512F-FAST-NEXT: vpbroadcastd 4(%r10), %ymm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastd 8(%r10), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-FAST-NEXT: vpandn %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7,8,9],ymm8[10],ymm10[11,12],ymm8[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7,8,9,10],ymm0[11],ymm10[12,13],ymm0[14],ymm10[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm17[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8,9,10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-FAST-NEXT: vprold $16, %ymm1, %ymm11 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <6,u,u,u,7,u,u,7> -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,u,u,7,u,u,7> +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -2593,38 +2654,38 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52> -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47> +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 +; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm5 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52> +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u> -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u> +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u> ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-NEXT: vmovdqa %ymm5, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 @@ -2656,393 +2717,389 @@ ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm1 -; SSE-NEXT: movdqa 48(%rcx), %xmm5 +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r8), %xmm6 -; SSE-NEXT: movdqa 48(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm14, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm9, %xmm1 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa 48(%r9), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm14 +; SSE-NEXT: movdqa 48(%rax), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrlq $48, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm11 -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: movdqa (%r9), %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2] -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: orps %xmm4, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,2],xmm6[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm12, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm15 +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] -; SSE-NEXT: andps %xmm8, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm4[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0,2] +; SSE-NEXT: andps %xmm11, %xmm0 +; SSE-NEXT: orps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 32(%r8), %xmm2 +; SSE-NEXT: movdqa 32(%r9), %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa 32(%rcx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 32(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 32(%rsi), %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm15 +; SSE-NEXT: por %xmm4, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: andnps %xmm0, %xmm14 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm10, %xmm4 -; SSE-NEXT: andnps %xmm1, %xmm10 -; SSE-NEXT: orps %xmm4, %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 ; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm3[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm13, %xmm3 +; SSE-NEXT: andnps %xmm4, %xmm13 +; SSE-NEXT: orps %xmm3, %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm3[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,1,0,1] +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,2,2,2] +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm6[3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm0, %xmm1 +; SSE-NEXT: andnps %xmm11, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] +; SSE-NEXT: movaps %xmm11, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,65535,65535,65535] @@ -3051,33 +3108,32 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm5, %xmm1 @@ -3086,29 +3142,30 @@ ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm5, %xmm1 @@ -3116,44 +3173,44 @@ ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535] -; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] +; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535] +; SSE-NEXT: movaps %xmm8, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: orps %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3166,19 +3223,18 @@ ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: andnps %xmm1, %xmm8 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] +; SSE-NEXT: movaps %xmm8, %xmm12 +; SSE-NEXT: andnps %xmm1, %xmm12 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3189,18 +3245,18 @@ ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: andnps %xmm1, %xmm5 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: orps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: andnps %xmm1, %xmm9 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: orps %xmm0, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7] @@ -3211,20 +3267,18 @@ ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] -; SSE-NEXT: andnps %xmm1, %xmm6 -; SSE-NEXT: orps %xmm0, %xmm6 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1] +; SSE-NEXT: andnps %xmm1, %xmm8 +; SSE-NEXT: orps %xmm0, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] @@ -3237,125 +3291,129 @@ ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm2, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1],mem[0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[2,1] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm15 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm15 -; SSE-NEXT: orps %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,1] -; SSE-NEXT: andps %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps $2, (%rsp), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[2,0],mem[0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1] +; SSE-NEXT: andps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[2,0],mem[0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1] +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: orps %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm11[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[2,1] +; SSE-NEXT: andps %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: andps %xmm4, %xmm10 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: andps %xmm4, %xmm13 +; SSE-NEXT: por %xmm13, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] ; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm14 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm3[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm2, %xmm12 -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[2,0],mem[2,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,0,1,1] -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[0,0,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: andps %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: andps %xmm4, %xmm5 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm15 +; SSE-NEXT: andps %xmm4, %xmm5 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[2,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,0,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: andps %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm5, %xmm4 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm4, 336(%rax) -; SSE-NEXT: movdqa %xmm12, 224(%rax) +; SSE-NEXT: movdqa %xmm15, 224(%rax) ; SSE-NEXT: movdqa %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm9, (%rax) -; SSE-NEXT: movdqa %xmm0, 288(%rax) -; SSE-NEXT: movaps %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movdqa %xmm11, (%rax) +; SSE-NEXT: movdqa %xmm1, 288(%rax) +; SSE-NEXT: movaps %xmm0, 176(%rax) +; SSE-NEXT: movaps %xmm2, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm6, 368(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rax) +; SSE-NEXT: movaps %xmm8, 368(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3363,7 +3421,7 @@ ; SSE-NEXT: movaps %xmm0, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) -; SSE-NEXT: movaps %xmm5, 256(%rax) +; SSE-NEXT: movaps %xmm9, 256(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 240(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3371,7 +3429,7 @@ ; SSE-NEXT: movaps %xmm0, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm8, 144(%rax) +; SSE-NEXT: movaps %xmm12, 144(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3383,8 +3441,8 @@ ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 320(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, 320(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) @@ -3392,8 +3450,6 @@ ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 400(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 384(%rax) ; SSE-NEXT: addq $696, %rsp # imm = 0x2B8 ; SSE-NEXT: retq ; @@ -3401,485 +3457,494 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm3[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm7 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm8 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm10 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm9[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,6,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm14 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm1[3],xmm14[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm5[0,2],xmm1[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm10[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm13 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,2],xmm6[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm12[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[0,2],xmm7[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,0,0,0] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm13[6],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm13[2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3],xmm13[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm13, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm14[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5],xmm12[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm11[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm15[0,2],xmm10[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm12[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[0,2],xmm13[1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm12[5],xmm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm0[0,2],xmm1[1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm12[1],xmm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm15, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm1[3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm11, %xmm9 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm5[1],xmm12[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm14[2,3],xmm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm2[1],xmm6[1] +; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps $80, (%rsp), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,2],xmm2[3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 @@ -3888,68 +3953,69 @@ ; ; AVX2-SLOW-LABEL: store_i16_stride7_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $600, %rsp # imm = 0x258 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm4 -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,3,u,u,u,4> -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] @@ -3957,42 +4023,42 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] @@ -4003,111 +4069,116 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,3,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm15, %ymm14, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,1,2,2] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3],xmm0[4],xmm6[5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1],xmm6[2],xmm15[3,4],xmm6[5],xmm15[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[1,1,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm15 +; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] @@ -4116,127 +4187,132 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm13 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[2,3,3,3,6,7,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] @@ -4244,26 +4320,27 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $600, %rsp # imm = 0x258 +; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4271,186 +4348,184 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4],ymm2[5],ymm15[6,7,8,9],ymm2[10],ymm15[11,12],ymm2[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm12, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7,8,9,10],ymm14[11],ymm5[12,13],ymm14[14],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm14, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm15, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm15, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm12, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [5,6,2,3,6,7,5,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = @@ -4458,160 +4533,164 @@ ; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm5, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm8, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm8, %ymm15, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX2-FAST-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = mem[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm9[1],xmm15[2,3],xmm9[4],xmm15[5,6],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2],xmm4[3,4],xmm14[5],xmm4[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm8 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[1,1,2,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1],xmm8[2],xmm13[3,4],xmm8[5],xmm13[6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,0,1,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,2,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 160(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm15, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4622,65 +4701,66 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride7_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $616, %rsp # imm = 0x268 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm12, %ymm3, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] @@ -4688,21 +4768,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -4711,17 +4791,18 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -4730,227 +4811,235 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm9, %ymm15, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1],xmm9[2],xmm15[3,4],xmm9[5],xmm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2,3],xmm1[4],xmm9[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm9, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] @@ -4958,1023 +5047,1106 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $616, %rsp # imm = 0x268 +; AVX2-FAST-PERLANE-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $792, %rsp # imm = 0x318 +; AVX512F-SLOW-NEXT: subq $1080, %rsp # imm = 0x438 ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm26 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm6, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm2, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,0,1,3] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm6 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm14, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8,9,10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm11, %ymm12, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm15 -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7,8,9],ymm11[10],ymm9[11,12],ymm11[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm31 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm11, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7,8,9,10],ymm10[11],ymm4[12,13],ymm10[14],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7,8,9],ymm4[10],ymm12[11,12],ymm4[13],ymm12[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7,8,9],ymm4[10],ymm7[11,12],ymm4[13],ymm7[14,15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm6[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm13, %ymm20, %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm14 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm13[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-SLOW-NEXT: vprold $16, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3],xmm0[4],xmm8[5,6],xmm0[7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7,8,9],ymm13[10],ymm0[11,12],ymm13[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-SLOW-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7,8,9,10],ymm0[11],ymm12[12,13],ymm0[14],ymm12[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm4[1],xmm13[2,3],xmm4[4],xmm13[5,6],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm26[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm26[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa %ymm10, %ymm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm31[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm18[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm19[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-SLOW-NEXT: vprold $16, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6,7,8],ymm9[9],ymm0[10,11],ymm9[12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm16, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,2,3],zmm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm31 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm27, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm11[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm13[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7,8,9,10],ymm9[11],ymm7[12,13],ymm9[14],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm23[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm18[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm17[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm16[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm23, %zmm31, %zmm25 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm18 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm22, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm31[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm15 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm19[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm5[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm30[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm29, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm30 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm9, %zmm31, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm31 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm9 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm7 +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm13[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm29[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,0,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm30[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) -; AVX512F-SLOW-NEXT: addq $792, %rsp # imm = 0x318 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-SLOW-NEXT: addq $1080, %rsp # imm = 0x438 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm3, %ymm29, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm9, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm6, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm9 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm5, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm31, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm16, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm27[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm21[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm24, %zmm27, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, (%rsp), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm6, %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1128, %rsp # imm = 0x468 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: subq $1128, %rsp # imm = 0x468 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandnq %ymm3, %ymm29, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %xmm9, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3,4],xmm3[5],xmm7[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm6, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm19 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm7, %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm9 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm5, %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm13 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8,9,10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm31 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm31, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm16, %ymm29 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm27[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm21[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm18[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,1,3,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm21 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm24, %zmm27, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm16 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm16 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,2,2,3] ; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, (%rsp), %ymm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm18 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm27, %zmm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 ; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FAST-NEXT: addq $1128, %rsp # imm = 0x468 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -6139,59 +6311,63 @@ ; SSE-NEXT: subq $1656, %rsp # imm = 0x678 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa 112(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdx), %xmm1 -; SSE-NEXT: movdqa 96(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rcx), %xmm8 -; SSE-NEXT: movdqa 112(%r8), %xmm5 -; SSE-NEXT: movdqa 112(%r9), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rax), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa 96(%rdx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdx), %xmm2 +; SSE-NEXT: movdqa 96(%rcx), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rcx), %xmm1 +; SSE-NEXT: movdqa 112(%r8), %xmm7 +; SSE-NEXT: movdqa 112(%r9), %xmm5 +; SSE-NEXT: movdqa 112(%rax), %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm1, %xmm2 -; SSE-NEXT: andnps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: orps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm15, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: orps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa 96(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6205,12 +6381,11 @@ ; SSE-NEXT: movdqa 96(%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 ; SSE-NEXT: movdqa 96(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] @@ -6223,461 +6398,469 @@ ; SSE-NEXT: andps %xmm4, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: andnps %xmm1, %xmm3 +; SSE-NEXT: orps %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: andps %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: andnps %xmm1, %xmm2 +; SSE-NEXT: movdqa (%r8), %xmm11 +; SSE-NEXT: movdqa (%r9), %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm12, %xmm2 +; SSE-NEXT: andps %xmm6, %xmm2 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,2,2,2,4,5,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,2],xmm0[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%r8), %xmm12 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm8 +; SSE-NEXT: movdqa 16(%r9), %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa 16(%rcx), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: movdqa 16(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm15, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm8, %xmm2 +; SSE-NEXT: andps %xmm11, %xmm2 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 32(%r8), %xmm1 -; SSE-NEXT: movdqa 32(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r8), %xmm13 +; SSE-NEXT: movdqa 32(%r9), %xmm11 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 32(%rcx), %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: movdqa 32(%rdx), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] +; SSE-NEXT: andps %xmm8, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,3,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%r8), %xmm0 +; SSE-NEXT: movdqa 48(%r8), %xmm3 ; SSE-NEXT: movdqa 48(%r9), %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 48(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa 48(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 48(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,2,2] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 -; SSE-NEXT: orps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] +; SSE-NEXT: andps %xmm6, %xmm0 +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm3[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm1[0,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rax), %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rax), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%r8), %xmm1 -; SSE-NEXT: movdqa 64(%r9), %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa 64(%r8), %xmm11 +; SSE-NEXT: movdqa 64(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 64(%rcx), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa 64(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 64(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa 64(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa 64(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa 64(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm0 +; SSE-NEXT: andps %xmm15, %xmm0 ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] @@ -6685,87 +6868,87 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rax), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 80(%r8), %xmm1 ; SSE-NEXT: movdqa 80(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa 80(%rcx), %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa 80(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: movdqa 80(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: pand %xmm12, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; SSE-NEXT: andps %xmm13, %xmm2 +; SSE-NEXT: andps %xmm15, %xmm2 ; SSE-NEXT: orps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3] @@ -6773,188 +6956,183 @@ ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: psrld $16, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm15 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm1, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[3,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0],xmm1[3,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535] -; SSE-NEXT: andps %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm6, %xmm0 -; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,0,0,0,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[0,3] -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: psrld $16, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3] -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,0,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: andps %xmm4, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[2,0] +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2] +; SSE-NEXT: andps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: andnps %xmm9, %xmm1 +; SSE-NEXT: orps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm6, %xmm4 ; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[2,2] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: andps %xmm12, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: andps %xmm13, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,1,3] ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,0,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -6966,66 +7144,65 @@ ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -7034,71 +7211,71 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7109,29 +7286,29 @@ ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4] -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,1,1,1,4,5,6,7] @@ -7144,13 +7321,12 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] -; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] ; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,0,0,0,65535,65535,65535] ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 @@ -7169,12 +7345,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[1,1] +; SSE-NEXT: movaps %xmm5, %xmm11 ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -7192,7 +7369,7 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: pshufhw $164, (%rsp), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 @@ -7216,13 +7393,13 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1] -; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -7240,13 +7417,14 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm9[1,1] +; SSE-NEXT: movaps %xmm9, %xmm12 ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -7270,8 +7448,7 @@ ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm6[1,1] ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 @@ -7302,41 +7479,42 @@ ; SSE-NEXT: andnps %xmm1, %xmm3 ; SSE-NEXT: orps %xmm0, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[0],mem[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[0,2] -; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,0,65535,65535] -; SSE-NEXT: andps %xmm9, %xmm7 +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[0],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0,0],mem[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535] +; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; SSE-NEXT: andnps %xmm1, %xmm9 -; SSE-NEXT: orps %xmm7, %xmm9 +; SSE-NEXT: andnps %xmm1, %xmm5 +; SSE-NEXT: orps %xmm3, %xmm5 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE-NEXT: andps %xmm3, %xmm9 +; SSE-NEXT: andps %xmm3, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm9, %xmm3 +; SSE-NEXT: orps %xmm5, %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0] -; SSE-NEXT: movaps {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: andps %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: andnps %xmm0, %xmm3 -; SSE-NEXT: orps %xmm1, %xmm3 -; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2,2],mem[2,0] +; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE-NEXT: andnps %xmm0, %xmm1 ; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: andps %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE-NEXT: andnps %xmm0, %xmm3 +; SSE-NEXT: orps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm2, %xmm0 @@ -7348,16 +7526,16 @@ ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm13 -; SSE-NEXT: movaps %xmm12, %xmm6 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[1],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,1] -; SSE-NEXT: andps %xmm0, %xmm6 -; SSE-NEXT: orps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] +; SSE-NEXT: andps %xmm0, %xmm5 +; SSE-NEXT: orps %xmm1, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; SSE-NEXT: movdqa %xmm2, %xmm1 @@ -7368,10 +7546,9 @@ ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movapd %xmm12, %xmm9 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1],mem[0] +; SSE-NEXT: movaps %xmm11, %xmm9 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[2,1] @@ -7388,8 +7565,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 ; SSE-NEXT: movaps %xmm15, %xmm8 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1],mem[0] +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,1] @@ -7405,9 +7582,10 @@ ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[1],mem[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm7 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,1] @@ -7423,9 +7601,9 @@ ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1],mem[0] +; SSE-NEXT: movaps %xmm12, %xmm5 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] @@ -7441,13 +7619,12 @@ ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm15 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1],mem[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm3 +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[2,0],mem[0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm13[2,1] ; SSE-NEXT: andps %xmm0, %xmm3 ; SSE-NEXT: orps %xmm6, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -7457,28 +7634,28 @@ ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[1],mem[0] +; SSE-NEXT: shufps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2,0],mem[0,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,1] ; SSE-NEXT: andps %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0] -; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0] +; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535] ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: andps %xmm2, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: andps %xmm2, %xmm12 +; SSE-NEXT: por %xmm12, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload ; SSE-NEXT: # xmm10 = xmm10[0],mem[0] @@ -7486,13 +7663,13 @@ ; SSE-NEXT: # xmm10 = xmm10[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] ; SSE-NEXT: movdqa %xmm2, %xmm13 ; SSE-NEXT: pandn %xmm1, %xmm13 ; SSE-NEXT: andps %xmm2, %xmm10 ; SSE-NEXT: por %xmm10, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = xmm4[0],mem[0] ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload @@ -7513,7 +7690,7 @@ ; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm15[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pandn %xmm1, %xmm11 @@ -7526,7 +7703,8 @@ ; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm14[0,0,1,1] +; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm10 @@ -7539,7 +7717,7 @@ ; SSE-NEXT: # xmm4 = xmm4[2,0],mem[2,1] ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,0,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: pandn %xmm1, %xmm15 @@ -7577,8 +7755,10 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 832(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7662,8 +7842,6 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) ; SSE-NEXT: addq $1656, %rsp # imm = 0x678 ; SSE-NEXT: retq @@ -7672,138 +7850,138 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 112(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vmovdqa 112(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6],xmm4[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 112(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0],xmm3[1],xmm8[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] ; AVX1-ONLY-NEXT: vmovdqa 112(%rax), %xmm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1,2,3,4,5,6],xmm11[7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm11 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2],xmm8[3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5],xmm8[6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm8 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,6,6] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5,6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm10[6],xmm11[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm12 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm15, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vorps %ymm8, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm2[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpslld $16, %xmm3, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2,3,4,5,6],xmm12[7] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm12 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm11 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm14, %ymm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,5,6,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3,4,5,6],xmm9[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] +; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vmovdqa 96(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 96(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3,4,5,6],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrld $16, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm9[6,7] +; AVX1-ONLY-NEXT: vmovdqa 96(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1,2,3,4,5,6],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm2[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3,4,5,6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm4[3],xmm6[4,5,6,7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm1[2,3],xmm8[4,5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm3 @@ -7825,9 +8003,10 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7842,9 +8021,10 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm15, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm5 @@ -7897,9 +8077,8 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 @@ -7935,8 +8114,8 @@ ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] @@ -7972,8 +8151,8 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7988,9 +8167,9 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 +; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm1 @@ -8015,9 +8194,9 @@ ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm4[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm1 @@ -8031,40 +8210,40 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpsrld $16, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vpsrld $16, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm15[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm8[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm15[0,2],xmm11[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm14[0,2],xmm11[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] @@ -8075,19 +8254,19 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm8[6],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm8 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm11[3],xmm8[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] @@ -8095,16 +8274,16 @@ ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa 64(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vpsrld $16, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa 64(%rsi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm9 @@ -8115,20 +8294,21 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vmovdqa 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa 64(%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5],xmm14[6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5],xmm7[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm10 @@ -8141,11 +8321,11 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[3,3,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,6,6] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] @@ -8153,12 +8333,13 @@ ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,2],xmm6[1,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdx), %xmm9 @@ -8182,9 +8363,8 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa 80(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 80(%r8), %xmm1 @@ -8243,13 +8423,12 @@ ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 @@ -8299,66 +8478,67 @@ ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpsrlq $48, %xmm15, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpsrlq $48, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm0[1],xmm6[1] ; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm13 = xmm15[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,0,0,0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vpsrld $16, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm12 @@ -8384,12 +8564,12 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0] ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm12 @@ -8650,21 +8830,21 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 880(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 848(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%rax) ; AVX1-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -8821,7 +9001,7 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] @@ -8993,7 +9173,7 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] @@ -9281,8 +9461,7 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -9536,13 +9715,13 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 @@ -9551,378 +9730,374 @@ ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm13 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm14 ; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm9 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 64(%rax), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7,8,9],ymm7[10],ymm1[11,12],ymm7[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9,22,23,22,23,22,23,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,2,2,3,5,6,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7,8,9],ymm3[10],ymm14[11,12],ymm3[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[1,2,2,3,5,6,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[1,2,2,3,5,6,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm11, %ymm14, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2,3],ymm2[4],ymm11[5,6,7,8],ymm2[9],ymm11[10,11],ymm2[12],ymm11[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,3,3,7,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm14[2],ymm3[3,4],ymm14[5],ymm3[6,7,8,9],ymm14[10],ymm3[11,12],ymm14[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8,9,10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm14 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm14 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8,9,10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5],ymm1[6],ymm11[7,8,9,10],ymm1[11],ymm11[12,13],ymm1[14],ymm11[15] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8,9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7,8],ymm11[9],ymm15[10,11],ymm11[12],ymm15[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6,7,8],ymm2[9],ymm12[10,11],ymm2[12],ymm12[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,u,4,u,u,4> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,2,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,2,3,6,7,5,6] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,2,6,6,6,6] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10026,11 +10201,11 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] @@ -10045,11 +10220,11 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10261,39 +10436,39 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <3,u,u,u,4,u,u,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10301,98 +10476,99 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm8, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm14, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm10, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm12, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm13, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm14, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm13, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rax), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -10403,19 +10579,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm5[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] @@ -10426,20 +10602,21 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -10448,11 +10625,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 124(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -10555,11 +10732,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 64(%rax), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rax), %ymm15 @@ -10573,12 +10750,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10732,10 +10909,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] @@ -10746,11 +10922,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15] @@ -10761,26 +10937,27 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -10788,8 +10965,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] @@ -10798,10 +10975,10 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7,8,9,10],ymm11[11],ymm7[12,13],ymm11[14],ymm7[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -10809,94 +10986,96 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm1, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[2,2,2,2,6,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,6,7,6,7,6,7,8,9,8,9,8,9,8,9,22,23,22,23,22,23,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] @@ -10905,56 +11084,53 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm2, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] @@ -11019,2429 +11195,2698 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $2440, %rsp # imm = 0x988 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm11 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm22 -; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm11, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm16, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm22 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vprold $16, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] -; AVX512F-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm19 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm23 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6,7,8],ymm0[9],ymm15[10,11],ymm0[12],ymm15[13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3],ymm0[4],ymm15[5,6,7,8],ymm0[9],ymm15[10,11],ymm0[12],ymm15[13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7,8,9,10],ymm0[11],ymm15[12,13],ymm0[14],ymm15[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm0 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm15[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7,8,9],ymm7[10],ymm11[11,12],ymm7[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm19, %zmm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm8 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm8[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[14,15],zero,zero,ymm8[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[16,17],zero,zero,ymm8[u,u],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm19, %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm7, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[2,1,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm11[2,2,2,2] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm7, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpandnq %ymm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] -; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm19, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm27 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm13[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7,8,9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[2,1,2,3,6,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm0[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[3,3,3,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm15[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm14, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2,3],xmm1[4],xmm8[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm8 -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7,8,9,10],ymm8[11],ymm2[12,13],ymm8[14],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] -; AVX512F-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512F-SLOW-NEXT: vprold $16, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,5,4,5,4,5,6,7,16,17,16,17,16,17,17,19] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6],xmm6[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %xmm12, %xmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vprold $16, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vprold $16, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpandn %ymm0, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-SLOW-NEXT: vpandnq %ymm0, %ymm16, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm11 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm1, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; AVX512F-SLOW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm6, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm19[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm13 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vprold $16, %ymm4, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermt2q %zmm0, %zmm12, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-SLOW-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[1,2,2,3,5,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7,8,9],ymm0[10],ymm7[11,12],ymm0[13],ymm7[14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512F-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm7 = mem[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,2,3,3] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm6[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm13, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[2,3,3,3,6,7,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vprold $16, %xmm5, %xmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[1,1,2,3] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm7[2],xmm13[3,4],xmm7[5],xmm13[6,7] +; AVX512F-SLOW-NEXT: vpermpd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,0,1,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm26[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6,7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13,14,15] ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = mem[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm13[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm2[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm5[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm14 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm7[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm3 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,3,3] +; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,2,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,0,1,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[0,0,2,1] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = mem[0,1,2,3,4,5,7,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm26 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] ; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[2,1,3,2] ; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,6] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,1,2,2] -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3],xmm1[4],xmm15[5,6],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm27[0,1,2,2,4,5,6,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm27[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[2,2,3,3] ; AVX512F-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm31 = mem[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm4[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm31[2,1,3,2] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm28[0,2,2,3] +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm20[2,1,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm19[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm0[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm0[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,1,3,2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,1,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm30[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,1,3,2] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm30 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm30 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm29[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm31, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 124(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm15 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm15 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm0, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm29, %zmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm19 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm20 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm20 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm10, %zmm2, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm23 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm22[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm15[2,1,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm4, %zmm25 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm0 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm3, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm24, %zmm4 -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $248, %zmm7, %zmm9, %zmm26 +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm7 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm20[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm19[2,1,3,2] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm10 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm23, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm15, %ymm17 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,2,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm5[0,1,2,2,4,5,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,1,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm14, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm1, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm13, %zmm1, %zmm15 +; AVX512F-SLOW-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm9 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm16 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm14[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm18 = mem[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm19 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm20 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm21 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm22 = mem[0,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm21 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm7 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm8 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-SLOW-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm23 = mem[2,1,3,3] +; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm24 = mem[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm25 = mem[2,2,2,3] ; AVX512F-SLOW-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm14 = mem[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm17 = mem[0,0,2,1] -; AVX512F-SLOW-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm6 = mem[0,2,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,1,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm3, %zmm30 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm21, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm17, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm3, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,1,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,3,3,3,6,7,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm9 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm31, %zmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm5[0,1,1,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm22, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm31, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm28[0,0,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm16, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm17[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,0,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm25, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm27[2,1,3,2] +; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm17 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm18, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm9, %zmm1, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm10 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: addq $2440, %rsp # imm = 0x988 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 832(%rax) +; AVX512F-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: subq $3656, %rsp # imm = 0xE48 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm18[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7,8,9,10],ymm12[11],ymm9[12,13],ymm12[14],ymm9[15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,3,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm12, %ymm20, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8,9,10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm7, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm28 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <5,u,u,u,6,u,u,6> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm3, %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm19, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm4, %ymm20, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm2, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm21, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm30 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7,8,9],ymm0[10],ymm9[11,12],ymm0[13],ymm9[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm4, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm21, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm6[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm1[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermpd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm0[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm16[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 124(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm31, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm8, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm8, %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm26 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,2,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,0,1,3] -; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm10, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm17 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm20 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm15, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm30 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $3656, %rsp # imm = 0xE48 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2264, %rsp # imm = 0x8D8 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm9 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm6, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: subq $3656, %rsp # imm = 0xE48 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm6 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm2 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm5 -; AVX512DQ-FAST-NEXT: vporq %ymm15, %ymm0, %ymm22 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7,8,9],ymm15[10],ymm11[11,12],ymm15[13],ymm11[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7,8,9,10],ymm0[11],ymm11[12,13],ymm0[14],ymm11[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7,8,9],ymm11[10],ymm0[11,12],ymm11[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7,8],ymm0[9],ymm11[10,11],ymm0[12],ymm11[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm8 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,ymm11[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero,ymm11[u,u],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm11 ; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm15 -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,28,29,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512DQ-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vpandn %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm24 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm9, %ymm31 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm23[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512DQ-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm18[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7,8,9,10],ymm12[11],ymm9[12,13],ymm12[14],ymm9[15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,3,3] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm19, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandnq %ymm12, %ymm20, %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,8,9,10,11,8,9,10,11,8,9,10,11,8,9,26,27,24,25,26,27,24,25,26,27,24,25,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13,14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8,9,10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,10,11,4,5,8,9,10,11,10,11,10,11,10,11,22,23,26,27,20,21,24,25,26,27,26,27,26,27,26,27] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm12, %ymm21 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7,8,9],ymm12[10],ymm9[11,12],ymm12[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6,7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,4,5,6,7,10,11,4,5,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm13 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm7, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm29 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1],xmm9[2],xmm12[3,4],xmm9[5],xmm12[6,7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm9 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm12, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3],xmm9[4],xmm12[5,6],xmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm28 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <5,u,u,u,6,u,u,6> +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vprold $16, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermd %ymm3, %ymm19, %ymm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm19 = [14,5,0,0,15,6,0,15,14,5,0,0,15,6,0,15] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm19, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpandnq %ymm4, %ymm20, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm19 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm8 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, %ymm14 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vprold $16, %ymm8, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,3,4,5,2,3,4,5,2,3,4,5,2,3,4,5,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8,9,10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm6 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm6, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [14,15,12,13,14,15,12,13,14,15,12,13,14,15,12,13,30,31,28,29,30,31,28,29,30,31,28,29,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [10,11,12,13,10,11,12,13,10,11,12,13,14,15,14,15,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm24 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm21, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm30 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7,8,9],ymm7[10],ymm3[11,12],ymm7[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[2,3,4,5,2,3,4,5,8,9,10,11,6,7,6,7,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm0[2],ymm9[3,4],ymm0[5],ymm9[6,7,8,9],ymm0[10],ymm9[11,12],ymm0[13],ymm9[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7,8,9,10],ymm11[11],ymm13[12,13],ymm11[14],ymm13[15] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm30, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512DQ-FAST-NEXT: vprold $16, %xmm2, %xmm7 -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm7[2],xmm12[3,4],xmm7[5],xmm12[6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm12 -; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm23 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm23 -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm2[1],xmm15[2,3],xmm2[4],xmm15[5,6],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm10 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm31 = mem[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm30 = mem[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm16[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4],ymm12[5],ymm4[6,7,8,9],ymm12[10],ymm4[11,12],ymm12[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermd %zmm4, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm21, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm22 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm11[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm17[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm29[2,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,1,3,2] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,2,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm6[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm1[0,0,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermpd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm0[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm16[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm14[0,0,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm3[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,3,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,2,3,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm14[0,1,2,3],zmm1[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm14 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm22, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 124(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm1 = mem[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm31, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm13 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm8, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm8, %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,1,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm9 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,2,2,3] -; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,1,3] -; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm17 = mem[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,0,1,3] -; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm13 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm10, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm15 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm19, %zmm5 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm29, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm10, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,0,0,1] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm16 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm17 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm21 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm23 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,0,0,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm15, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm30 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 832(%rax) -; AVX512DQ-FAST-NEXT: addq $2264, %rsp # imm = 0x8D8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 832(%rax) +; AVX512DQ-FAST-NEXT: addq $3656, %rsp # imm = 0xE48 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $136, %rsp +; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm31 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm20, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm0 +; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm0 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm0, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm7, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0] +; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm13, %zmm15 -; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm16 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm28, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] -; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm30, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm22, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm11 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C ; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm15 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm11 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm24, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm28, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0 -; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm26 {%k3} -; AVX512BW-NEXT: kmovd %ecx, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm15 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0] +; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm16, %zmm25, %zmm13 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm19, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] +; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm28, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512BW-NEXT: vpermi2w %zmm2, %zmm17, %zmm20 +; AVX512BW-NEXT: vpermi2w %zmm27, %zmm16, %zmm9 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm9 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm20 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm20, %zmm0 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] +; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm9 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm31, %zmm20 +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm3, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm2 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm30 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm14, %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm31 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm7 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vpermi2w %zmm9, %zmm4, %zmm14 -; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm7 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm4, %zmm12 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm16 -; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060 +; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vpermi2w %zmm14, %zmm0, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm23, %zmm0 +; AVX512BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm0 +; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512BW-NEXT: kmovd %ecx, %k4 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm11 {%k4} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm12 +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm12 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm17, %zmm15 +; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm22 +; AVX512BW-NEXT: vpermi2w %zmm22, %zmm0, %zmm23 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm16, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm12 {%k4} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm16 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] +; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm23, %zmm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm3, %zmm16 +; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> -; AVX512BW-NEXT: vpermi2w %zmm9, %zmm12, %zmm31 -; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm13 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm31, %zmm18 -; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k3} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm12 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm14 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm4, %zmm29 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm12, %zmm17 -; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm21, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm4, %zmm6 -; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm15 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm1, %zmm24 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm17 {%k2} +; AVX512BW-NEXT: vpermi2w %zmm22, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 ; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm22 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm21, %zmm24 -; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm6, %zmm1 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm18 {%k3} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm3 +; AVX512BW-NEXT: vpermi2w %zmm10, %zmm5, %zmm25 +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm1, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm3 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm18 {%k3} +; AVX512BW-NEXT: vpermi2w %zmm22, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm16, %zmm13 +; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm4, %zmm13 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm13 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm3, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm21 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm23 +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm1, %zmm28 +; AVX512BW-NEXT: vpermi2w %zmm5, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm24, %zmm23 +; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm1, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm1, %zmm5 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermt2w %zmm21, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm28 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm2 {%k3} -; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm21 {%k1} +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm22, %zmm16 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm24, %zmm16 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm16, %zmm30 +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm16, %zmm10 +; AVX512BW-NEXT: vpermt2w %zmm8, %zmm31, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm13, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm20 {%k2} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] +; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm7, %zmm6 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm1 {%k2} +; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm19, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm19, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] -; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2w %zmm22, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vpermi2w %zmm22, %zmm5, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> +; AVX512BW-NEXT: vpermi2w %zmm22, %zmm8, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31] +; AVX512BW-NEXT: vpermi2w %zmm26, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] +; AVX512BW-NEXT: vpermi2w %zmm26, %zmm5, %zmm0 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) -; AVX512BW-NEXT: addq $136, %rsp +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -25,24 +25,22 @@ ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,7,5] -; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, 16(%rax) -; SSE-NEXT: movapd %xmm3, (%rax) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: movapd %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf2: @@ -54,12 +52,12 @@ ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] @@ -77,15 +75,17 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovdqa (%r11), %xmm3 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -99,15 +99,15 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-NEXT: vmovdqa (%r11), %xmm3 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512F-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512F-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -121,15 +121,15 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512BW-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512BW-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 @@ -206,64 +206,60 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm3[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm10 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,3,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,1,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm13[4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,2,0,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm1[1,2,3],xmm11[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7] -; AVX1-ONLY-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,3,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[2,0,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7] +; AVX1-ONLY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -533,77 +529,77 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa (%rsi), %xmm9 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: movdqa (%r8), %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa (%r9), %xmm8 -; SSE-NEXT: movdqa (%r10), %xmm2 +; SSE-NEXT: movdqa (%r10), %xmm4 ; SSE-NEXT: movdqa (%rax), %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] +; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] ; SSE-NEXT: movdqa %xmm15, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm13[0],xmm6[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm13[2],xmm7[3],xmm13[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm12[0],xmm7[1] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3] ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,0,0] ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm9[0],xmm8[1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm10, 112(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movapd %xmm10, 96(%rax) ; SSE-NEXT: movaps %xmm11, 80(%rax) ; SSE-NEXT: movapd %xmm8, 64(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm7, 48(%rax) +; SSE-NEXT: movapd %xmm6, 32(%rax) ; SSE-NEXT: movaps %xmm5, 16(%rax) ; SSE-NEXT: movapd %xmm3, (%rax) ; SSE-NEXT: retq @@ -850,786 +846,752 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride8_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp +; SSE-NEXT: pushq %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm12 +; SSE-NEXT: movdqa (%rdx), %xmm13 ; SSE-NEXT: movdqa (%rcx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm8 -; SSE-NEXT: movdqa (%r9), %xmm2 -; SSE-NEXT: movdqa (%r10), %xmm10 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: movdqa %xmm8, %xmm12 -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm14 +; SSE-NEXT: movdqa (%r9), %xmm0 +; SSE-NEXT: movdqa (%r10), %xmm15 +; SSE-NEXT: movdqa (%rax), %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm6 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] -; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm2[0],xmm5[1] ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; SSE-NEXT: movdqa 16(%rcx), %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movdqa 16(%r10), %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rax), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%r10), %xmm6 -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; SSE-NEXT: movdqa 16(%r8), %xmm4 -; SSE-NEXT: movdqa 16(%r9), %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,0,0,0] +; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,0,0,0] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm13[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm13[0],xmm7[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm9[0],xmm13[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm15[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm3[0],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rsi), %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1] +; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm3[0],xmm11[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1] ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,2,2] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] +; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm11[0],xmm1[1] +; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movapd %xmm1, 224(%rax) -; SSE-NEXT: movaps %xmm3, 240(%rax) -; SSE-NEXT: movapd %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm9, 176(%rax) -; SSE-NEXT: movapd %xmm13, 96(%rax) -; SSE-NEXT: movaps %xmm12, 112(%rax) -; SSE-NEXT: movapd %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) -; SSE-NEXT: movapd %xmm14, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movapd %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm6, 208(%rax) +; SSE-NEXT: movapd %xmm4, 192(%rax) +; SSE-NEXT: movaps %xmm3, 176(%rax) +; SSE-NEXT: movapd %xmm15, 160(%rax) +; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movapd %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm9, 112(%rax) +; SSE-NEXT: movapd %xmm12, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: popq %rax ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm5, %ymm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2],ymm2[3],ymm15[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2],ymm3[3],ymm12[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3],ymm9[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm3[2,3],ymm12[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm14, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3],ymm8[4,5,6],ymm11[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3],ymm12[4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4],ymm7[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3],ymm11[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1],ymm11[2,3],ymm1[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm15[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm8 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3],ymm10[4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride8_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: pushq %rax +; AVX2-FAST-NEXT: subq $40, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm3 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm4 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm5 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm14 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3],ymm2[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm14[4],ymm12[5],ymm14[5],ymm12[6],ymm14[6],ymm12[7],ymm14[7],ymm12[12],ymm14[12],ymm12[13],ymm14[13],ymm12[14],ymm14[14],ymm12[15],ymm14[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[12],ymm11[12],ymm10[13],ymm11[13],ymm10[14],ymm11[14],ymm10[15],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[8],ymm11[8],ymm6[9],ymm11[9],ymm6[10],ymm11[10],ymm6[11],ymm11[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm11[4],ymm6[5],ymm11[5],ymm6[6],ymm11[6],ymm6[7],ymm11[7],ymm6[12],ymm11[12],ymm6[13],ymm11[13],ymm6[14],ymm11[14],ymm6[15],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm4[4],ymm13[4],ymm4[5],ymm13[5],ymm4[6],ymm13[6],ymm4[7],ymm13[7],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: popq %rax +; AVX2-FAST-NEXT: addq $40, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm6[0],zero,xmm6[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm9[0],zero,xmm9[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm5[2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm2[2,3],ymm8[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3],ymm10[4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2,3],ymm13[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[2],ymm12[2],ymm9[3],ymm12[3],ymm9[8],ymm12[8],ymm9[9],ymm12[9],ymm9[10],ymm12[10],ymm9[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3],ymm11[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm12[4],ymm9[5],ymm12[5],ymm9[6],ymm12[6],ymm9[7],ymm12[7],ymm9[12],ymm12[12],ymm9[13],ymm12[13],ymm9[14],ymm12[14],ymm9[15],ymm12[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm11[2,3],ymm2[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3],ymm12[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3,4],ymm4[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1637,84 +1599,77 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512F-NEXT: vmovdqa (%r8), %ymm15 -; AVX512F-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-NEXT: vmovdqa (%r10), %ymm6 -; AVX512F-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-NEXT: vmovdqa (%r10), %xmm4 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-NEXT: vmovdqa (%rax), %xmm5 +; AVX512F-NEXT: vmovdqa (%r10), %xmm6 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 +; AVX512F-NEXT: vmovdqa (%r9), %xmm7 ; AVX512F-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512F-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 -; AVX512F-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm18 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm3[0],ymm15[1],ymm3[1],ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm19 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512F-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512F-NEXT: vmovdqa %ymm7, %ymm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] -; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15] +; AVX512F-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm18 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 +; AVX512F-NEXT: vmovdqa (%r8), %ymm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm19 +; AVX512F-NEXT: vmovdqa (%r10), %ymm15 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-NEXT: vmovdqa (%rax), %ymm12 +; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm15[4],ymm3[4],ymm15[5],ymm3[5],ymm15[6],ymm3[6],ymm15[7],ymm3[7],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11] ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] ; AVX512F-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512F-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> -; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> -; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> +; AVX512F-NEXT: vpermt2d %zmm16, %zmm5, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> +; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm2 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-NEXT: vpermt2d %zmm18, %zmm5, %zmm14 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,4,20,4,5,5,21,10,9,14,30,14,13,15,31] -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] -; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm14, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: vpermt2d %zmm9, %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,20,1,3,5,21,5,7,14,30,11,11,15,31,15,15] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-NEXT: vpermt2d %zmm12, %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2127,259 +2082,283 @@ ; ; AVX1-ONLY-LABEL: store_i16_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: subq $232, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm13[0],zero,xmm13[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3,4],ymm6[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3],ymm10[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1],ymm12[2,3],ymm2[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; AVX1-ONLY-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -2389,248 +2368,198 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4],ymm10[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,2,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,2,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 288(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $232, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i16_stride8_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: subq $232, %rsp ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm15 +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] @@ -2640,15 +2569,15 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] @@ -2656,60 +2585,60 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %ymm13 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] @@ -2728,492 +2657,466 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 448(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 384(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i16_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm7 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm0[3],ymm12[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,2,2,u,u,3,3> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,0,u,u,u,1,u> -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,1,1,1,1,u,u> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm15 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm10 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3,4],ymm4[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4],ymm14[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3],ymm8[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm8[2,3],ymm1[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,4,2,1,6,5,6,5] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3,4],ymm1[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm5 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm8[4],ymm11[5],ymm8[5],ymm11[6],ymm8[6],ymm11[7],ymm8[7],ymm11[12],ymm8[12],ymm11[13],ymm8[13],ymm11[14],ymm8[14],ymm11[15],ymm8[15] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm2[1],ymm14[2,3,4],ymm2[5],ymm14[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2,3],ymm2[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,4,2,1,6,5,6,5] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm15, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,1,3,5,7,5,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,1,6,5,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%r10), %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm8 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[8],ymm8[8],ymm6[9],ymm8[9],ymm6[10],ymm8[10],ymm6[11],ymm8[11] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[8],ymm5[8],ymm12[9],ymm5[9],ymm12[10],ymm5[10],ymm12[11],ymm5[11] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3],ymm13[4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[8],ymm7[8],ymm13[9],ymm7[9],ymm13[10],ymm7[10],ymm13[11],ymm7[11] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,1,6,5,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,5,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [6,5,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,4,4,4,4,6,5] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,4,5,5,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm8[4],ymm6[5],ymm8[5],ymm6[6],ymm8[6],ymm6[7],ymm8[7],ymm6[12],ymm8[12],ymm6[13],ymm8[13],ymm6[14],ymm8[14],ymm6[15],ymm8[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm12[4],ymm5[4],ymm12[5],ymm5[5],ymm12[6],ymm5[6],ymm12[7],ymm5[7],ymm12[12],ymm5[12],ymm12[13],ymm5[13],ymm12[14],ymm5[14],ymm12[15],ymm5[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,4,4,4,4,6,5] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3],ymm8[4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm7[4],ymm13[5],ymm7[5],ymm13[6],ymm7[6],ymm13[7],ymm7[7],ymm13[12],ymm7[12],ymm13[13],ymm7[13],ymm13[14],ymm7[14],ymm13[15],ymm7[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,1,3,5,7,5,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm15, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 384(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i16_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm3[1],ymm12[2,3,4],ymm3[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm8[0],zero,xmm8[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3],ymm8[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,2,3,4,6,6,7] @@ -3223,15 +3126,15 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm10[0],ymm5[1],ymm10[1],ymm5[2],ymm10[2],ymm5[3],ymm10[3],ymm5[8],ymm10[8],ymm5[9],ymm10[9],ymm5[10],ymm10[10],ymm5[11],ymm10[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,1,1,3,4,5,5,7] @@ -3239,60 +3142,60 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[1],ymm9[1],ymm13[2],ymm9[2],ymm13[3],ymm9[3],ymm13[8],ymm9[8],ymm13[9],ymm9[9],ymm13[10],ymm9[10],ymm13[11],ymm9[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,0,2,1,4,4,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm6[0],ymm14[0],ymm6[1],ymm14[1],ymm6[2],ymm14[2],ymm6[3],ymm14[3],ymm6[8],ymm14[8],ymm6[9],ymm14[9],ymm6[10],ymm14[10],ymm6[11],ymm14[11] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4],ymm15[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm13[4],ymm9[4],ymm13[5],ymm9[5],ymm13[6],ymm9[6],ymm13[7],ymm9[7],ymm13[12],ymm9[12],ymm13[13],ymm9[13],ymm13[14],ymm9[14],ymm13[15],ymm9[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[1],ymm3[1],ymm7[2],ymm3[2],ymm7[3],ymm3[3],ymm7[8],ymm3[8],ymm7[9],ymm3[9],ymm7[10],ymm3[10],ymm7[11],ymm3[11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm14[4],ymm6[5],ymm14[5],ymm6[6],ymm14[6],ymm6[7],ymm14[7],ymm6[12],ymm14[12],ymm6[13],ymm14[13],ymm6[14],ymm14[14],ymm6[15],ymm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm3[4],ymm7[5],ymm3[5],ymm7[6],ymm3[6],ymm7[7],ymm3[7],ymm7[12],ymm3[12],ymm7[13],ymm3[13],ymm7[14],ymm3[14],ymm7[15],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,1,3,4,5,5,7] @@ -3311,34 +3214,34 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3346,340 +3249,309 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm18, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm27, %zmm30 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm19, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm20, %zmm29 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm18, %zmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm19, %zmm31 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm20, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm21, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm29 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm27 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm19, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm20, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm22, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm23, %zmm28 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[8],ymm11[8],ymm0[9],ymm11[9],ymm0[10],ymm11[10],ymm0[11],ymm11[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm25, %zmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm9 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] ; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm12 -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm18, %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm19, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm20, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm21, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm28, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm29, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm26, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm27, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm22, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm23, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[12],ymm11[12],ymm0[13],ymm11[13],ymm0[14],ymm11[14],ymm0[15],ymm11[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm25, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm26, %zmm15 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm18, %zmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm19, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm20, %zmm17 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm21, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm18, %zmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm19, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm19, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[8],ymm6[8],ymm1[9],ymm6[9],ymm1[10],ymm6[10],ymm1[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm6[4],ymm1[5],ymm6[5],ymm1[6],ymm6[6],ymm1[7],ymm6[7],ymm1[12],ymm6[12],ymm1[13],ymm6[13],ymm1[14],ymm6[14],ymm1[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm20, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm21, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm28, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: movb $-86, %al -; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm21, %zmm0 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm22, %zmm12 +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm23, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm25, %zmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11] +; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm26, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm22, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm23, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm25, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm26, %zmm2 {%k2} +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: movb $-86, %cl +; AVX512F-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm29 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: subq $136, %rsp ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm25 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm28 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm31 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm29 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm10 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm24 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm25 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm26 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm20 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm22 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[8],ymm7[8],ymm9[9],ymm7[9],ymm9[10],ymm7[10],ymm9[11],ymm7[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm9[4],ymm7[4],ymm9[5],ymm7[5],ymm9[6],ymm7[6],ymm9[7],ymm7[7],ymm9[12],ymm7[12],ymm9[13],ymm7[13],ymm9[14],ymm7[14],ymm9[15],ymm7[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm15, %zmm17 ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> ; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm22 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm1, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm16, %zmm24 ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm1, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm11, %zmm9 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm11, %zmm12 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm11, %zmm3 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm11, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm6, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm4, %zmm18 -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm6, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm6, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm10, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm6, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm10, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm6, %zmm14 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm24 {%k2} ; AVX512F-FAST-NEXT: movb $-86, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-FAST-NEXT: kmovw %eax, %k3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm24 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm15, %zmm17 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm17 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm16, %zmm23 +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm23 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 {%k3} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm17, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm26, %zmm25 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm27, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm28, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm29 +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm31 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm17, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm26, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm27, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm28, %zmm18 {%k2} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm15, %zmm20 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm1, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm16, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm5 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm1, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm16, %zmm1 +; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm1 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm26, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm27, %zmm2 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm28, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k3} +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm26, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm27, %zmm3 +; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm28, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k3} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-FAST-NEXT: addq $136, %rsp ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -6839,661 +6711,644 @@ ; ; AVX512F-SLOW-LABEL: store_i16_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm4, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm5, %zmm31 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm29, %zmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm4, %zmm31 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm5, %zmm19 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm4, %zmm19 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[8],ymm7[8],ymm11[9],ymm7[9],ymm11[10],ymm7[10],ymm11[11],ymm7[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm12, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[8],ymm6[8],ymm0[9],ymm6[9],ymm0[10],ymm6[10],ymm0[11],ymm6[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm14, %zmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm15, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[12],ymm6[12],ymm0[13],ymm6[13],ymm0[14],ymm6[14],ymm0[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm31 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm16, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm27, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm1 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[8],ymm6[8],ymm12[9],ymm6[9],ymm12[10],ymm6[10],ymm12[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm29, %zmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm14, %zmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm15, %zmm28 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, %zmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm10, %zmm17 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[12],ymm7[12],ymm11[13],ymm7[13],ymm11[14],ymm7[14],ymm11[15],ymm7[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm7, %zmm12, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm22, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm10, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm15, %zmm11 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm21, %zmm16 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm15, %zmm5 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm26 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm20, %zmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm21, %zmm18 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm12, %zmm20 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm20 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[8],ymm9[8],ymm0[9],ymm9[9],ymm0[10],ymm9[10],ymm0[11],ymm9[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm22, %zmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm10, %zmm21 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %xmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] ; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %xmm4 +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm12, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm9[4],ymm0[5],ymm9[5],ymm0[6],ymm9[6],ymm0[7],ymm9[7],ymm0[12],ymm9[12],ymm0[13],ymm9[13],ymm0[14],ymm9[14],ymm0[15],ymm9[15] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm22, %zmm22 +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm10, %zmm22 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm15, %zmm25 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm24, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm11, %zmm24 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm27 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm26, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm5 +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm7, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm11, %zmm26 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 64(%r10), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm12, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rax), %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm28 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[12],ymm6[12],ymm12[13],ymm6[13],ymm12[14],ymm6[14],ymm12[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm23 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm15, %zmm23 {%k2} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11] -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm30 -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm30 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm5[0],ymm11[1],ymm5[1],ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[8],ymm5[8],ymm11[9],ymm5[9],ymm11[10],ymm5[10],ymm11[11],ymm5[11] -; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm14, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm22 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm29, %zmm24 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm24 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm12[4],ymm0[5],ymm12[5],ymm0[6],ymm12[6],ymm0[7],ymm12[7],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm5[4],ymm11[5],ymm5[5],ymm11[6],ymm5[6],ymm11[7],ymm5[7],ymm11[12],ymm5[12],ymm11[13],ymm5[13],ymm11[14],ymm5[14],ymm11[15],ymm5[15] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm14, %zmm25 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm15, %zmm25 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm29, %zmm29 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm12, %zmm29 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm13, %zmm29 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm14, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512F-SLOW-NEXT: vpermd %zmm10, %zmm15, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm14, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm6 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm15, %zmm7 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm16, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm27, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm16, %zmm16 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm10 {%k1} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm13 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm15 -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm27, %zmm15 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm27, %zmm6 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm15, %zmm30 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm5, %zmm30 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm15, %zmm7 +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm5, %zmm7 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm13, %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm27, %zmm9 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm13, %zmm1 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512F-SLOW-NEXT: vpermd %zmm14, %zmm27, %zmm4 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm13, %zmm2 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm2 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpermd %zmm11, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm27, %zmm11 {%k2} -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm13, %zmm3 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm3 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm5 {%k2} +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm8, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm11, %zmm3 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm11, %zmm1 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512F-SLOW-NEXT: vpermd %zmm5, %zmm12, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa 96(%r10), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rax), %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm13, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] +; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm12, %zmm2 +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm13, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] +; AVX512F-SLOW-NEXT: vpermd %zmm8, %zmm11, %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %ymm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[8],ymm9[8],ymm12[9],ymm9[9],ymm12[10],ymm9[10],ymm12[11],ymm9[11] +; AVX512F-SLOW-NEXT: vpermd %zmm13, %zmm10, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[12],ymm9[12],ymm12[13],ymm9[13],ymm12[14],ymm9[14],ymm12[15],ymm9[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] +; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm11, %zmm4 +; AVX512F-SLOW-NEXT: vpermd %zmm9, %zmm10, %zmm4 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[8],ymm6[8],ymm9[9],ymm6[9],ymm9[10],ymm6[10],ymm9[11],ymm6[11] +; AVX512F-SLOW-NEXT: vpermd %zmm12, %zmm11, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[8],ymm13[8],ymm0[9],ymm13[9],ymm0[10],ymm13[10],ymm0[11],ymm13[11] +; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm10, %zmm12 {%k2} +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[12],ymm13[12],ymm0[13],ymm13[13],ymm0[14],ymm13[14],ymm0[15],ymm13[15] +; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[12],ymm6[12],ymm9[13],ymm6[13],ymm9[14],ymm6[14],ymm9[15],ymm6[15] +; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm11, %zmm6 +; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm10, %zmm6 {%k2} ; AVX512F-SLOW-NEXT: movb $-86, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm25 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm29, %zmm4 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm28, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 768(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 960(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 896(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 832(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm31, (%rax) +; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i16_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: subq $2440, %rsp # imm = 0x988 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm9 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[8],ymm14[8],ymm9[9],ymm14[9],ymm9[10],ymm14[10],ymm9[11],ymm14[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm4[4],ymm7[5],ymm4[5],ymm7[6],ymm4[6],ymm7[7],ymm4[7],ymm7[12],ymm4[12],ymm7[13],ymm4[13],ymm7[14],ymm4[14],ymm7[15],ymm4[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %xmm4 ; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 64(%r10), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm31 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm25 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm27 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm21 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm24 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm26 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm29 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm18 +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm17 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[2],ymm7[2],ymm0[3],ymm7[3],ymm0[8],ymm7[8],ymm0[9],ymm7[9],ymm0[10],ymm7[10],ymm0[11],ymm7[11] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm7[4],ymm0[5],ymm7[5],ymm0[6],ymm7[6],ymm0[7],ymm7[7],ymm0[12],ymm7[12],ymm0[13],ymm7[13],ymm0[14],ymm7[14],ymm0[15],ymm7[15] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm15 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm11 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: movw $-30584, %ax # imm = 0x8888 ; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 {%k2} # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm1, %zmm31 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm28 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm0, %zmm25 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm16 -; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm3 -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm3 {%k2} +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm1, %zmm22 +; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm0, %zmm22 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm19 +; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm0, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm0, %zmm4 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: movw $8738, %ax # imm = 0x2222 ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm18 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm26 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm30 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm12, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm19, %zmm0, %zmm19 -; AVX512F-FAST-NEXT: vpermd %zmm21, %zmm12, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermd %zmm17, %zmm12, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm12, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm7 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm17 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm21 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm21 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm29 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm29 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpermd %zmm23, %zmm5, %zmm23 -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm12, %zmm23 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm5, %zmm20 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm12, %zmm20 {%k2} -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm12, %zmm5 {%k2} -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,u,1,u,1,u,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm16 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm21 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm23 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm1 {%k1} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm30 +; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm17, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm0, %zmm24 +; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm17, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm18, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm17, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm17, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,4,5,4,5,5,7,10,9,14,13,14,13,15,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,8,10,12,14,12,14,14,15] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm20 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm20 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm25 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm25 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm31 {%k2} # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpermd %zmm26, %zmm9, %zmm26 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm17, %zmm26 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm27, %zmm9, %zmm27 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm17, %zmm27 {%k2} +; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm9, %zmm12 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm17, %zmm12 {%k2} ; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm13, %zmm12 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm22, %zmm13, %zmm22 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm24, %zmm13, %zmm24 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm17, %zmm8 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,5,1,3,5,7,5,7,14,13,11,11,15,15,15,15] +; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm13, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,12,14,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm28 +; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm13, %zmm28 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm13, %zmm29 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm13, %zmm15 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm15 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm14 ; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm13, %zmm14 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm11, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpermd %zmm7, %zmm13, %zmm7 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm13, %zmm5 +; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm17, %zmm5 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm9, %zmm9 ; AVX512F-FAST-NEXT: vpermd %zmm9, %zmm13, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm13 -; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm11, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm17, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movb $-86, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm12 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm23 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm15 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm30 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 832(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: addq $2504, %rsp # imm = 0x9C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 960(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 896(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 768(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-FAST-NEXT: addq $2440, %rsp # imm = 0x988 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -7508,56 +7363,56 @@ ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm16, %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 @@ -7579,11 +7434,11 @@ ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2w %zmm17, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 @@ -7604,33 +7459,33 @@ ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7644,28 +7499,28 @@ ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm3, %zmm17 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -7766,22 +7621,22 @@ ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -89,15 +89,15 @@ ; SSE-NEXT: movaps (%rsi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf8: @@ -159,25 +159,25 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm6 ; SSE-NEXT: movaps 48(%rsi), %xmm7 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf16: @@ -190,12 +190,12 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 @@ -204,8 +204,8 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -217,16 +217,16 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -269,100 +269,100 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] ; SSE-NEXT: movaps 112(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm3, 128(%rdx) -; SSE-NEXT: movaps %xmm15, 144(%rdx) -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm0, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm4, 176(%rdx) +; SSE-NEXT: movaps %xmm13, 160(%rdx) +; SSE-NEXT: movaps %xmm3, 144(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm14, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm9, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm6[2],xmm10[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -378,28 +378,28 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[6],ymm7[6],ymm3[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[4],ymm7[4],ymm3[5],ymm7[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[0,1],ymm8[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -409,17 +409,17 @@ ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -585,106 +585,106 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride2_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 144(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 208(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 416(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm12, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 288(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -707,56 +707,56 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[0,1],ymm15[0,1] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm13[2],ymm7[3],ymm13[3],ymm7[6],ymm13[6],ymm7[7],ymm13[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm7[0],ymm13[0],ymm7[1],ymm13[1],ymm7[4],ymm13[4],ymm7[5],ymm13[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm14[2],ymm9[3],ymm14[3],ymm9[6],ymm14[6],ymm9[7],ymm14[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm14[0],ymm9[1],ymm14[1],ymm9[4],ymm14[4],ymm9[5],ymm14[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm8[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm8[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm11[2],ymm6[3],ymm11[3],ymm6[6],ymm11[6],ymm6[7],ymm11[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[0,1],ymm15[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm15 ; AVX2-ONLY-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -770,10 +770,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm4 @@ -783,14 +783,14 @@ ; AVX512-NEXT: vpermt2d %zmm6, %zmm10, %zmm2 ; AVX512-NEXT: vpermi2d %zmm7, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -35,33 +35,46 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-ONLY-NEXT: vmovlps %xmm1, 16(%rcx) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-LABEL: store_i32_stride3_vf2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,1,3,5,u,u> -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovlps %xmm1, 16(%rcx) -; AVX2-NEXT: vmovaps %xmm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: store_i32_stride3_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,u,1,5,u,u,u> +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-ONLY-NEXT: vmovlps %xmm1, 16(%rcx) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rcx) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i32_stride3_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,10,1,5,13,u,u> +; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 16(%rcx) +; AVX512-NEXT: vmovdqa %xmm2, (%rcx) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i32>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i32>, ptr %in.vecptr2, align 64 @@ -79,19 +92,19 @@ ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps (%rsi), %xmm1 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[0,3] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] ; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 16(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) ; SSE-NEXT: movaps %xmm4, (%rcx) ; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq @@ -193,40 +206,40 @@ ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm3 -; SSE-NEXT: movaps 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm2 +; SSE-NEXT: movaps (%rdx), %xmm6 ; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm4[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm0[1,0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm7[0,2] ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,2] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm4[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm9, (%rcx) -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 48(%rcx) -; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm0, 32(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 64(%rcx) +; SSE-NEXT: movaps %xmm7, 48(%rcx) +; SSE-NEXT: movaps %xmm6, 16(%rcx) +; SSE-NEXT: movaps %xmm3, (%rcx) ; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf8: @@ -294,27 +307,28 @@ ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -374,83 +388,78 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm8 -; SSE-NEXT: movaps (%rsi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movaps 48(%rdi), %xmm3 +; SSE-NEXT: movaps (%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rsi), %xmm9 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rdx), %xmm3 -; SSE-NEXT: movaps 48(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps %xmm8, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm7[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] +; SSE-NEXT: movaps 32(%rsi), %xmm7 +; SSE-NEXT: movaps 48(%rsi), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm12 +; SSE-NEXT: movaps 16(%rdx), %xmm10 +; SSE-NEXT: movaps 32(%rdx), %xmm8 +; SSE-NEXT: movaps 48(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,0],xmm0[1,0] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm13[0,2] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[0,2] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm3[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm13 -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,0] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm5[3,3] -; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm12[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm7[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm0[2,3] -; SSE-NEXT: movaps %xmm13, (%rcx) -; SSE-NEXT: movaps %xmm5, 16(%rcx) -; SSE-NEXT: movaps %xmm15, 48(%rcx) -; SSE-NEXT: movaps %xmm9, 64(%rcx) -; SSE-NEXT: movaps %xmm14, 96(%rcx) -; SSE-NEXT: movaps %xmm10, 112(%rcx) -; SSE-NEXT: movaps %xmm6, 144(%rcx) -; SSE-NEXT: movaps %xmm11, 160(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] -; SSE-NEXT: movaps %xmm4, 32(%rcx) +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm10[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[0,2] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,0],xmm2[1,0] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm2, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[0,2] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm6, 160(%rcx) +; SSE-NEXT: movaps %xmm9, 144(%rcx) +; SSE-NEXT: movaps %xmm8, 112(%rcx) +; SSE-NEXT: movaps %xmm11, 96(%rcx) +; SSE-NEXT: movaps %xmm10, 64(%rcx) +; SSE-NEXT: movaps %xmm13, 48(%rcx) +; SSE-NEXT: movaps %xmm12, 16(%rcx) +; SSE-NEXT: movaps %xmm5, (%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] +; SSE-NEXT: movaps %xmm3, 176(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 80(%rcx) +; SSE-NEXT: movaps %xmm2, 128(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] -; SSE-NEXT: movaps %xmm1, 128(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 176(%rcx) +; SSE-NEXT: movaps %xmm1, 80(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 32(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf16: @@ -471,14 +480,13 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] @@ -486,13 +494,14 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[3,3],xmm5[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] @@ -505,9 +514,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -526,11 +535,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -538,11 +547,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm1[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] @@ -556,60 +565,62 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride3_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm3 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0],ymm3[1,2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2],ymm5[3,4],ymm9[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -627,11 +638,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -639,11 +650,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm1[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] @@ -657,9 +668,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -700,250 +711,245 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movaps 64(%rdi), %xmm2 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm3 ; SSE-NEXT: movaps 48(%rdi), %xmm4 +; SSE-NEXT: movaps 64(%rsi), %xmm6 ; SSE-NEXT: movaps (%rsi), %xmm12 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps 16(%rsi), %xmm10 +; SSE-NEXT: movaps 32(%rsi), %xmm8 +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movaps 16(%rdx), %xmm13 +; SSE-NEXT: movaps 32(%rdx), %xmm11 +; SSE-NEXT: movaps 48(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm0[1,0] +; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm14[2,3] +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm10[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm13[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] -; SSE-NEXT: movaps %xmm6, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm10[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm9 -; SSE-NEXT: movaps 64(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps 64(%rsi), %xmm12 -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,2] -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm11[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm4[1,0] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[0,2] +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm7[1,1] +; SSE-NEXT: movaps 64(%rdx), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm0, %xmm15 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] -; SSE-NEXT: movaps 80(%rsi), %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm2[1,0] ; SSE-NEXT: movaps %xmm2, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm1[0,2] +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm15[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm2[0,2] +; SSE-NEXT: movaps 80(%rdi), %xmm6 +; SSE-NEXT: movaps 80(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[1,0] +; SSE-NEXT: movaps 80(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[0,2] ; SSE-NEXT: movaps 96(%rdi), %xmm0 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm6 -; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm3[0,2] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps 112(%rdx), %xmm10 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[0,3] -; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps 96(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[1,0] +; SSE-NEXT: movaps 96(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm10[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm14[2,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm10[2,3] -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps %xmm4, 336(%rcx) -; SSE-NEXT: movaps %xmm6, 304(%rcx) -; SSE-NEXT: movaps %xmm7, 288(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[0,2] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2] +; SSE-NEXT: movaps 112(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0] +; SSE-NEXT: movaps 112(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[0,2] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm9, 352(%rcx) +; SSE-NEXT: movaps %xmm2, 336(%rcx) +; SSE-NEXT: movaps %xmm5, 304(%rcx) +; SSE-NEXT: movaps %xmm4, 288(%rcx) ; SSE-NEXT: movaps %xmm8, 256(%rcx) -; SSE-NEXT: movaps %xmm11, 240(%rcx) -; SSE-NEXT: movaps %xmm12, 208(%rcx) -; SSE-NEXT: movaps %xmm13, 192(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0,1,3] -; SSE-NEXT: movaps %xmm3, 368(%rcx) +; SSE-NEXT: movaps %xmm10, 240(%rcx) +; SSE-NEXT: movaps %xmm15, 208(%rcx) +; SSE-NEXT: movaps %xmm11, 192(%rcx) +; SSE-NEXT: movaps %xmm12, 160(%rcx) +; SSE-NEXT: movaps %xmm13, 144(%rcx) +; SSE-NEXT: movaps %xmm14, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 368(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 320(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 272(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 224(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3] +; SSE-NEXT: movaps %xmm6, 272(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: movaps %xmm0, 224(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 176(%rcx) -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[1,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 128(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 80(%rcx) -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,3] +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm6[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm6[1,1],xmm13[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm12[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3,4],ymm12[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[3,3],xmm9[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm9[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm12[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5,6],ymm12[7] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm8[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[1,1],xmm11[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm9[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm6[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0],xmm10[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3,4],ymm8[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[3,3],xmm7[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1],xmm10[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,0],xmm9[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,1],xmm10[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0,0,3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[1,1],xmm12[0,2] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm11[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm12 @@ -954,53 +960,37 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0,0,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[1,1],xmm14[0,2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm8[3,3],xmm7[3,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1],xmm8[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0,0,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = mem[0,0,3,3,4,4,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = mem[0,0,3,3,4,4,7,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1,2],ymm0[3],ymm12[4,5],ymm0[6],ymm12[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 352(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 192(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 288(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 256(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -1008,106 +998,106 @@ ; AVX2-SLOW-LABEL: store_i32_stride3_vf32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $40, %rsp -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = mem[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm3[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-SLOW-NEXT: vbroadcastsd 64(%rdx), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm4, 288(%rcx) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 352(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 352(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 288(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 256(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-SLOW-NEXT: addq $40, %rsp @@ -1116,203 +1106,203 @@ ; ; AVX2-FAST-LABEL: store_i32_stride3_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5,6],ymm7[7] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm11[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] -; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm6 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2,3],ymm1[4],ymm13[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm5[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6],ymm6[7] +; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm2[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6],ymm9[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4],ymm13[5],ymm9[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm7[1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm13, %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm12 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6],ymm14[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4],ymm14[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm5[1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm12, 160(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm13, 128(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 256(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3],ymm12[4],ymm4[5,6],ymm12[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1,2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm0 +; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm14[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm2, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm8, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm9, 288(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride3_vf32: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $40, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = mem[1,0,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = mem[1,0,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm3[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 64(%rdx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = mem[1,0,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4],ymm12[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3],ymm13[4],ymm9[5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0],ymm6[1,2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 288(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 352(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 352(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 288(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 256(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: addq $40, %rsp @@ -1332,27 +1322,26 @@ ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] -; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] -; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm13 -; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm1 -; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9 -; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm9 -; AVX512-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512-NEXT: vpermt2d %zmm2, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpermt2d %zmm4, %zmm2, %zmm0 +; AVX512-NEXT: vpermi2d %zmm3, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm6 +; AVX512-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2d %zmm3, %zmm12, %zmm1 +; AVX512-NEXT: vpermt2d %zmm5, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1370,386 +1359,338 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $664, %rsp # imm = 0x298 +; SSE-NEXT: subq $424, %rsp # imm = 0x1A8 +; SSE-NEXT: movaps 64(%rdi), %xmm3 ; SSE-NEXT: movaps (%rdi), %xmm1 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps (%rsi), %xmm12 -; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps 32(%rsi), %xmm10 -; SSE-NEXT: movaps 48(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps 16(%rdx), %xmm6 -; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] +; SSE-NEXT: movaps 32(%rdi), %xmm4 +; SSE-NEXT: movaps 48(%rdi), %xmm5 +; SSE-NEXT: movaps 64(%rsi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm11 +; SSE-NEXT: movaps 16(%rsi), %xmm9 +; SSE-NEXT: movaps 32(%rsi), %xmm8 +; SSE-NEXT: movaps 48(%rsi), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm14 +; SSE-NEXT: movaps 16(%rdx), %xmm13 +; SSE-NEXT: movaps 32(%rdx), %xmm12 +; SSE-NEXT: movaps 48(%rdx), %xmm10 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm14[2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm14[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[0,2] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm15[0,2] +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[1,0] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm11[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,2],xmm13[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm6[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm10[3,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[1,0] ; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm9[3,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm8[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm12[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdi), %xmm2 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] -; SSE-NEXT: movaps 64(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm5[1,0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm2 -; SSE-NEXT: movaps 80(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] -; SSE-NEXT: movaps 80(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,2],xmm10[2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm7[1,1] +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[1,0] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm1[2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdi), %xmm2 -; SSE-NEXT: movaps 96(%rdx), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movaps 80(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 80(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdi), %xmm2 -; SSE-NEXT: movaps 112(%rdx), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps 96(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 96(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] -; SSE-NEXT: movaps 112(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm2 -; SSE-NEXT: movaps 128(%rdx), %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps 112(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 112(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3] -; SSE-NEXT: movaps 128(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdi), %xmm13 -; SSE-NEXT: movaps 144(%rdx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm3 +; SSE-NEXT: movaps 128(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[1,0] +; SSE-NEXT: movaps 128(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm0[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3] -; SSE-NEXT: movaps 144(%rsi), %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rdi), %xmm15 +; SSE-NEXT: movaps 144(%rdx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm15[1,0] +; SSE-NEXT: movaps 144(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm13 +; SSE-NEXT: movaps 160(%rdx), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm13[1,0] +; SSE-NEXT: movaps 160(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdi), %xmm9 -; SSE-NEXT: movaps 160(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 160(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,2],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rdi), %xmm10 +; SSE-NEXT: movaps 176(%rdx), %xmm9 ; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm15[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdi), %xmm15 -; SSE-NEXT: movaps 176(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 176(%rsi), %xmm11 -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm11[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps 192(%rdx), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm10[1,0] +; SSE-NEXT: movaps 176(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 192(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] -; SSE-NEXT: movaps 208(%rdi), %xmm8 -; SSE-NEXT: movaps 208(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdi), %xmm12 +; SSE-NEXT: movaps 192(%rdx), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm12[1,0] +; SSE-NEXT: movaps 192(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm12, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3] -; SSE-NEXT: movaps 208(%rsi), %xmm7 -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] -; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps 224(%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,3] -; SSE-NEXT: movaps 224(%rsi), %xmm5 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: movaps 240(%rdi), %xmm0 -; SSE-NEXT: movaps 240(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm12[0,3] -; SSE-NEXT: movaps 240(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,2],xmm14[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[0,2] +; SSE-NEXT: movaps 208(%rdi), %xmm7 +; SSE-NEXT: movaps 208(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm7[1,0] +; SSE-NEXT: movaps 208(%rsi), %xmm0 +; SSE-NEXT: movaps %xmm7, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm8[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[0,2] +; SSE-NEXT: movaps 224(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[1,0] +; SSE-NEXT: movaps 224(%rsi), %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = xmm11[1,2],mem[2,3] -; SSE-NEXT: shufps $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,2],mem[2,3] -; SSE-NEXT: shufps $233, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,2],mem[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm12[2,3] -; SSE-NEXT: movaps %xmm1, 736(%rcx) -; SSE-NEXT: movaps %xmm4, 720(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] +; SSE-NEXT: movaps 240(%rdi), %xmm1 +; SSE-NEXT: movaps 240(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm1[1,0] +; SSE-NEXT: movaps 240(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm3[0,2] +; SSE-NEXT: movaps %xmm9, 736(%rcx) +; SSE-NEXT: movaps %xmm0, 720(%rcx) ; SSE-NEXT: movaps %xmm5, 688(%rcx) ; SSE-NEXT: movaps %xmm6, 672(%rcx) -; SSE-NEXT: movaps %xmm7, 640(%rcx) -; SSE-NEXT: movaps %xmm10, 624(%rcx) +; SSE-NEXT: movaps %xmm8, 640(%rcx) +; SSE-NEXT: movaps %xmm11, 624(%rcx) ; SSE-NEXT: movaps %xmm14, 592(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 576(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 544(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 528(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 496(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 480(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 448(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 432(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 400(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 384(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 352(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 336(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 304(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 288(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 256(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 240(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 208(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 192(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 160(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 144(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 112(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 64(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%rcx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm0, 752(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 704(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0,1,3] -; SSE-NEXT: movaps %xmm8, 656(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0,1,3] -; SSE-NEXT: movaps %xmm11, 608(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] -; SSE-NEXT: movaps %xmm15, 560(%rcx) -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] -; SSE-NEXT: movaps %xmm9, 512(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 576(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 544(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 496(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 480(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 448(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 400(%rcx) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 384(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 256(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] +; SSE-NEXT: movaps %xmm1, 752(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0,1,3] +; SSE-NEXT: movaps %xmm4, 704(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] +; SSE-NEXT: movaps %xmm7, 656(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0,1,3] +; SSE-NEXT: movaps %xmm12, 608(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0,1,3] +; SSE-NEXT: movaps %xmm10, 560(%rcx) ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0,1,3] -; SSE-NEXT: movaps %xmm13, 464(%rcx) -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,3] +; SSE-NEXT: movaps %xmm13, 512(%rcx) +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0,1,3] +; SSE-NEXT: movaps %xmm15, 464(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 416(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] @@ -1775,7 +1716,7 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; SSE-NEXT: movaps %xmm0, 32(%rcx) -; SSE-NEXT: addq $664, %rsp # imm = 0x298 +; SSE-NEXT: addq $424, %rsp # imm = 0x1A8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride3_vf64: @@ -2269,197 +2210,205 @@ ; AVX2-FAST-LABEL: store_i32_stride3_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [5,6,5,6,5,6,7,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm10[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm3 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm13[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm12[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm10[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm3 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7] -; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0],ymm4[1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4],ymm10[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] -; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 56(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm4 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm1[1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 88(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm9[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm0 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 120(%rdi), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm7 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm7[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vbroadcastsd 152(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm8[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm11 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm11[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vbroadcastsd 184(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm11[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovaps %ymm7, 736(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm10, 704(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm6, 672(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm4, 608(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 544(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm2, 512(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm3, 480(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm8, 448(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm15, 416(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm14, 384(%rcx) -; AVX2-FAST-NEXT: vmovaps %ymm13, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm13 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm13[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vbroadcastsd 216(%rdi), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] +; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2],ymm6[3,4],ymm15[5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm15 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm15[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0],ymm6[1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd 248(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3,4],ymm13[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm15[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovaps %ymm2, 736(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm6, 704(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm11, 640(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm10, 608(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm8, 544(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm7, 512(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 448(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm3, 352(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm9, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm4, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm12, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 672(%rcx) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 576(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-FAST-NEXT: addq $232, %rsp @@ -2713,10 +2662,10 @@ ; ; AVX512-LABEL: store_i32_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 @@ -2726,53 +2675,52 @@ ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] -; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] -; AVX512-NEXT: vpermt2d %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512-NEXT: vpermt2d %zmm6, %zmm15, %zmm7 -; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm11 -; AVX512-NEXT: vpermt2d %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm6 -; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 -; AVX512-NEXT: vpermt2d %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 -; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm4 -; AVX512-NEXT: vpermt2d %zmm8, %zmm20, %zmm4 -; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512-NEXT: vpermt2d %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] +; AVX512-NEXT: vpermt2d %zmm8, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] +; AVX512-NEXT: vpermt2d %zmm8, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm8 +; AVX512-NEXT: vpermt2d %zmm9, %zmm14, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512-NEXT: vpermt2d %zmm1, %zmm15, %zmm19 +; AVX512-NEXT: vpermt2d %zmm9, %zmm17, %zmm19 +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512-NEXT: vpermt2d %zmm6, %zmm12, %zmm5 +; AVX512-NEXT: vpermt2d %zmm10, %zmm14, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm9 +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm2 +; AVX512-NEXT: vpermt2d %zmm10, %zmm4, %zmm2 +; AVX512-NEXT: vpermi2d %zmm7, %zmm3, %zmm12 +; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 +; AVX512-NEXT: vpermi2d %zmm3, %zmm7, %zmm15 +; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm15 +; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm3 +; AVX512-NEXT: vpermt2d %zmm11, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, 704(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 640(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 512(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -2789,7 +2737,7 @@ ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX1: {{.*}} -; AVX2-ONLY: {{.*}} +; AVX2: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -181,10 +181,10 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm1, 48(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm1, 32(%r8) ; SSE-NEXT: movaps %xmm5, 16(%r8) ; SSE-NEXT: movaps %xmm6, (%r8) ; SSE-NEXT: retq @@ -277,33 +277,33 @@ ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] ; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm7[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm6, 112(%r8) -; SSE-NEXT: movaps %xmm8, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) -; SSE-NEXT: movaps %xmm0, 32(%r8) -; SSE-NEXT: movaps %xmm5, 48(%r8) -; SSE-NEXT: movaps %xmm2, (%r8) -; SSE-NEXT: movaps %xmm3, 16(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, 112(%r8) +; SSE-NEXT: movaps %xmm6, 96(%r8) +; SSE-NEXT: movaps %xmm8, 80(%r8) +; SSE-NEXT: movaps %xmm10, 64(%r8) +; SSE-NEXT: movaps %xmm0, 48(%r8) +; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm2, 16(%r8) +; SSE-NEXT: movaps %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf8: @@ -375,18 +375,18 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm6, (%r8) ; AVX2-ONLY-NEXT: vzeroupper @@ -439,94 +439,94 @@ ; SSE-NEXT: movaps %xmm5, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] ; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; SSE-NEXT: movaps %xmm11, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm3[2],xmm11[3],xmm3[3] ; SSE-NEXT: movaps %xmm11, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] ; SSE-NEXT: movaps %xmm10, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] ; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: movaps %xmm13, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm15[1] ; SSE-NEXT: movaps 48(%rdx), %xmm15 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] ; SSE-NEXT: movaps 48(%rcx), %xmm12 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] ; SSE-NEXT: movaps %xmm15, %xmm10 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] ; SSE-NEXT: movaps 48(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, 224(%r8) -; SSE-NEXT: movaps %xmm1, 240(%r8) -; SSE-NEXT: movaps %xmm3, 192(%r8) -; SSE-NEXT: movaps %xmm0, 208(%r8) -; SSE-NEXT: movaps %xmm4, 160(%r8) -; SSE-NEXT: movaps %xmm9, 176(%r8) -; SSE-NEXT: movaps %xmm13, 128(%r8) -; SSE-NEXT: movaps %xmm14, 144(%r8) -; SSE-NEXT: movaps %xmm11, 96(%r8) -; SSE-NEXT: movaps %xmm8, 112(%r8) -; SSE-NEXT: movaps %xmm7, 64(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm15[1] +; SSE-NEXT: movaps %xmm2, 240(%r8) +; SSE-NEXT: movaps %xmm1, 224(%r8) +; SSE-NEXT: movaps %xmm3, 208(%r8) +; SSE-NEXT: movaps %xmm0, 192(%r8) +; SSE-NEXT: movaps %xmm4, 176(%r8) +; SSE-NEXT: movaps %xmm9, 160(%r8) +; SSE-NEXT: movaps %xmm13, 144(%r8) +; SSE-NEXT: movaps %xmm14, 128(%r8) +; SSE-NEXT: movaps %xmm11, 112(%r8) +; SSE-NEXT: movaps %xmm8, 96(%r8) +; SSE-NEXT: movaps %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movaps %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf16: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $24, %rsp ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[1],xmm9[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] @@ -534,8 +534,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm5[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] @@ -544,9 +544,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm5[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] @@ -554,34 +554,34 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm14 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm13[1],xmm14[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm15[2],xmm2[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm2[2],xmm15[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm12[2],xmm11[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm12[2],xmm11[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm8[2],xmm6[2] @@ -601,15 +601,15 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: addq $24, %rsp @@ -618,26 +618,26 @@ ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm5 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3],ymm11[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -645,9 +645,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm8 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,1,3] @@ -657,9 +657,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -674,14 +674,14 @@ ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm8[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm6, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -973,84 +973,100 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride4_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $472, %rsp # imm = 0x1D8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[1],xmm6[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[1],xmm4[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 80(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm5[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1061,40 +1077,24 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm13[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[1],xmm9[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm6[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1122,175 +1122,177 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm14[2],xmm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm12[2],xmm13[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm1[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = zero,zero,xmm8[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[3,0],xmm13[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm7[2],xmm9[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm5[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[3,0],xmm10[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm3[2],xmm4[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,0],xmm7[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm11[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm11[3,0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 480(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 480(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) -; AVX1-ONLY-NEXT: addq $472, %rsp # imm = 0x1D8 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) +; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3],ymm12[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1],ymm9[2,3],ymm0[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,3,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,1,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm14 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm15 +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] @@ -1298,17 +1300,17 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm14[0],ymm3[0],ymm14[1],ymm3[1],ymm14[4],ymm3[4],ymm14[5],ymm3[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[4],ymm15[4],ymm13[5],ymm15[5] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] @@ -1333,24 +1335,24 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm7, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 320(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm12, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX2-ONLY-NEXT: popq %rax ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1365,32 +1367,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -1405,14 +1407,14 @@ ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1426,32 +1428,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -1466,14 +1468,14 @@ ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -2753,32 +2755,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512F-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512F-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2829,22 +2831,22 @@ ; AVX512F-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2866,32 +2868,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2942,22 +2944,22 @@ ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -28,8 +28,8 @@ ; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] @@ -43,64 +43,64 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,0,2,u,5,7,u] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vmovlps %xmm1, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,2,1,4,6,6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[u,u,0,2,u,5,7,u] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero +; AVX1-ONLY-NEXT: vmovlps %xmm0, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride5_vf2: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: movq (%r8), %rax ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovq %rax, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <0,2,4,6,u,1,3,5> -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vmovd %eax, %xmm3 -; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovq %xmm1, 32(%r9) +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: movq (%r8), %rax +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vmovq %rax, %xmm1 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,2,6,u,1,5,3> +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovd %eax, %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-ONLY-NEXT: vmovq %xmm1, 32(%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: store_i32_stride5_vf2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; AVX512-NEXT: vmovlps %xmm1, 32(%r9) -; AVX512-NEXT: vmovaps %ymm0, (%r9) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,2,4,6,16,1,3,5,7,17,u,u,u,u,u,u> +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 32(%r9) +; AVX512-NEXT: vmovdqa %ymm2, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 @@ -131,25 +131,25 @@ ; SSE-NEXT: movaps %xmm0, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm4[3,3] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[3,3] ; SSE-NEXT: movaps %xmm1, %xmm7 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[0,2] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[0,1] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm3, 32(%r9) -; SSE-NEXT: movaps %xmm8, 48(%r9) +; SSE-NEXT: movaps %xmm3, 48(%r9) +; SSE-NEXT: movaps %xmm8, 32(%r9) ; SSE-NEXT: movaps %xmm7, 64(%r9) ; SSE-NEXT: movaps %xmm6, (%r9) ; SSE-NEXT: movaps %xmm0, 16(%r9) @@ -185,8 +185,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -214,7 +214,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) @@ -248,10 +248,10 @@ ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vmovaps %xmm0, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -279,7 +279,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[3],xmm2[1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) @@ -321,138 +321,134 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm6 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm5 +; SSE-NEXT: movdqa 16(%rsi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[0,2] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,3],xmm0[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: movaps (%r8), %xmm8 +; SSE-NEXT: movaps 16(%r8), %xmm4 +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm6[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm13[0],xmm9[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[0,2] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm11[2,3] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,0] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] -; SSE-NEXT: movaps %xmm10, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm11[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,1] -; SSE-NEXT: movaps %xmm5, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm14[0,1] -; SSE-NEXT: movaps %xmm1, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm6[0] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = xmm10[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,0] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,0] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm14, (%r9) -; SSE-NEXT: movaps %xmm4, 32(%r9) -; SSE-NEXT: movaps %xmm15, 48(%r9) -; SSE-NEXT: movaps %xmm13, 80(%r9) -; SSE-NEXT: movaps %xmm2, 112(%r9) -; SSE-NEXT: movaps %xmm12, 128(%r9) -; SSE-NEXT: movaps %xmm1, 16(%r9) -; SSE-NEXT: movaps %xmm9, 64(%r9) +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm11[0],xmm9[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,0] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm3[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,0] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[3,0] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm7[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE-NEXT: movaps %xmm4, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm6[0,1] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE-NEXT: movaps %xmm3, 128(%r9) +; SSE-NEXT: movaps %xmm13, 112(%r9) +; SSE-NEXT: movdqa %xmm7, 80(%r9) +; SSE-NEXT: movaps %xmm12, 48(%r9) +; SSE-NEXT: movaps %xmm8, 32(%r9) +; SSE-NEXT: movdqa %xmm11, (%r9) +; SSE-NEXT: movaps %xmm10, 144(%r9) ; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm8, 144(%r9) +; SSE-NEXT: movaps %xmm9, 64(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm8[0],xmm6[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,zero,xmm9[2],xmm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1],xmm7[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm4[1,1],ymm2[1,1],ymm4[5,5],ymm2[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4],ymm9[5],ymm7[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm8[2],xmm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5],ymm8[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1],xmm3[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm4[2],xmm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5],ymm6[6],ymm3[7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm4[2],xmm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm9[1,1],ymm8[5,5],ymm9[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4],ymm6[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm9[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -485,24 +481,24 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -511,8 +507,8 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: vzeroupper @@ -541,29 +537,29 @@ ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <0,1,u,u,3,2,3,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -572,8 +568,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -608,24 +604,24 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm1[1,2],ymm9[3,4],ymm1[5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] @@ -634,8 +630,8 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -650,19 +646,19 @@ ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <6,14,u,23,31,7,15,u> ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14> +; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 -; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r9) -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5,6],ymm1[7] -; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-NEXT: vmovdqa %ymm3, 128(%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <8 x i32>, ptr %in.vecptr0, align 64 @@ -683,444 +679,444 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa 32(%rsi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm9 -; SSE-NEXT: movdqa 32(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm12 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm11 -; SSE-NEXT: movaps (%r8), %xmm3 -; SSE-NEXT: movaps 16(%r8), %xmm15 +; SSE-NEXT: subq $136, %rsp +; SSE-NEXT: movdqa (%rsi), %xmm1 +; SSE-NEXT: movdqa 16(%rsi), %xmm12 +; SSE-NEXT: movdqa 32(%rsi), %xmm6 +; SSE-NEXT: movdqa %xmm6, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: movaps (%rcx), %xmm15 +; SSE-NEXT: movaps 16(%rcx), %xmm4 +; SSE-NEXT: movaps 32(%rcx), %xmm9 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm7 ; SSE-NEXT: movaps 32(%r8), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa %xmm12, %xmm13 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rsi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE-NEXT: movdqa 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: movaps 48(%rcx), %xmm14 +; SSE-NEXT: movaps 48(%r8), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, %xmm9 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,0] -; SSE-NEXT: movdqa %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[1,1] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm10[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm5[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm7, 288(%r9) -; SSE-NEXT: movaps %xmm3, 272(%r9) -; SSE-NEXT: movdqa %xmm5, 240(%r9) -; SSE-NEXT: movaps %xmm15, 208(%r9) +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: movdqa %xmm5, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm8[0,1] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: movaps %xmm0, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm8[0] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm0[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm11[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm8[0,1] +; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps 48(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm4[0,2] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm13[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm6[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm2[0],xmm8[1,2,3] +; SSE-NEXT: movaps %xmm14, 288(%r9) +; SSE-NEXT: movaps %xmm0, 272(%r9) +; SSE-NEXT: movaps %xmm1, 240(%r9) +; SSE-NEXT: movaps %xmm7, 208(%r9) ; SSE-NEXT: movaps %xmm11, 192(%r9) -; SSE-NEXT: movdqa %xmm6, 160(%r9) -; SSE-NEXT: movaps %xmm13, 128(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) -; SSE-NEXT: movdqa %xmm12, 80(%r9) +; SSE-NEXT: movdqa %xmm9, 160(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movaps %xmm15, 112(%r9) +; SSE-NEXT: movdqa %xmm10, 80(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%r9) +; SSE-NEXT: movaps %xmm0, 48(%r9) +; SSE-NEXT: movaps %xmm12, 32(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm4, 256(%r9) +; SSE-NEXT: movaps %xmm8, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps %xmm3, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm10, 96(%r9) +; SSE-NEXT: movaps %xmm5, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) -; SSE-NEXT: movaps %xmm14, 16(%r9) -; SSE-NEXT: addq $168, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%r9) +; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm10[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm12[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm8[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm8[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm10[2],xmm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm12 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm12[1],ymm6[2,3,4,5],ymm12[6],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm11[2],xmm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1],xmm10[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,1],ymm2[1,1],ymm9[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm14[2],xmm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,1],xmm15[1,1] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm9[1,1],ymm0[1,1],ymm9[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4],ymm10[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[3,3],ymm1[3,3],ymm3[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm12[1,2,3,4],ymm15[5],ymm12[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm7 = zero,zero,xmm8[2],xmm7[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3],xmm8[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm14[3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm14[2],xmm13[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1],xmm13[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm7[1],ymm0[2,3,4,5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = zero,zero,xmm8[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,1],ymm1[1,1],ymm15[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2],ymm8[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm11[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,3],ymm11[3,3],ymm13[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm2[1,2,3,4],ymm0[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2],ymm4[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm1[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2],ymm9[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride5_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm15 +; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3],ymm14[4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm10[0,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1,2],ymm14[3,4],ymm3[5,6],ymm14[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm5[1,2],ymm7[3,4],ymm5[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4],ymm7[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm15[3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2],ymm8[3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm13, 288(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 256(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 160(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 288(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 256(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 128(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm13, 96(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride5_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm10 @@ -1136,97 +1132,97 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm8 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm11, %ymm12 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,3,2,3,2,3,2] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm12 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14 +; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm14 +; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm4[0,1],ymm14[2],ymm4[3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1] -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm9 ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm0[1,2,3],ymm7[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1,2],ymm5[3,4],ymm15[5,6],ymm5[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2],ymm8[3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3],ymm1[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm2, 224(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm5, 288(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm14, 288(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm10, 256(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm14, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm6, (%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) @@ -1237,126 +1233,126 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride5_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1,2,3],ymm14[4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm10[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3],ymm10[4],ymm9[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm4[1],ymm13[2,3,4],ymm4[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm3[1,2],ymm14[3,4],ymm3[5,6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm5[1,2],ymm7[3,4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4],ymm7[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4],ymm15[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm15[3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm15[2],ymm8[3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 288(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1509,54 +1505,55 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $728, %rsp # imm = 0x2D8 -; SSE-NEXT: movdqa (%rsi), %xmm9 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm11 -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa 32(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: movaps 32(%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps 32(%r8), %xmm13 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm12 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm8 -; SSE-NEXT: movaps 48(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: subq $600, %rsp # imm = 0x258 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm11 +; SSE-NEXT: movdqa (%rdx), %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdx), %xmm13 +; SSE-NEXT: movaps (%rcx), %xmm6 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm15 +; SSE-NEXT: movaps (%r8), %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm9[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm15[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rsi), %xmm9 +; SSE-NEXT: movdqa 48(%rdx), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 48(%rcx), %xmm4 +; SSE-NEXT: movaps 48(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1565,26 +1562,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps 64(%rcx), %xmm5 +; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm2 +; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 80(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1593,12 +1589,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm2 +; SSE-NEXT: movaps 96(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1607,249 +1603,224 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps 112(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm15[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm3[1,1] +; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[2,0] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm12[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm14[0,1] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: movaps %xmm8, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 16(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] +; SSE-NEXT: movdqa 32(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm11[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm13[2,0] +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm15[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 64(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 64(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm3[0],xmm12[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 80(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm14 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 80(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 96(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,1],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm2[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: movaps 112(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,1] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] +; SSE-NEXT: movaps %xmm8, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = xmm8[1,0],mem[1,1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm8[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE-NEXT: movaps %xmm1, %xmm15 -; SSE-NEXT: movaps %xmm1, %xmm12 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm2[0],xmm15[1,2,3] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm4[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm14[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3] ; SSE-NEXT: movaps %xmm0, 608(%r9) -; SSE-NEXT: movaps %xmm12, 592(%r9) -; SSE-NEXT: movaps %xmm4, 560(%r9) -; SSE-NEXT: movaps %xmm7, 528(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%r9) -; SSE-NEXT: movaps %xmm8, 480(%r9) +; SSE-NEXT: movaps %xmm3, 592(%r9) +; SSE-NEXT: movaps %xmm1, 560(%r9) +; SSE-NEXT: movaps %xmm9, 528(%r9) +; SSE-NEXT: movaps %xmm6, 512(%r9) +; SSE-NEXT: movaps %xmm10, 480(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 448(%r9) -; SSE-NEXT: movaps %xmm9, 432(%r9) -; SSE-NEXT: movaps %xmm13, 400(%r9) +; SSE-NEXT: movaps %xmm13, 432(%r9) +; SSE-NEXT: movaps %xmm14, 400(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1882,183 +1853,183 @@ ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) -; SSE-NEXT: movaps %xmm3, 576(%r9) +; SSE-NEXT: movaps %xmm5, 576(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%r9) -; SSE-NEXT: movaps %xmm5, 496(%r9) +; SSE-NEXT: movaps %xmm7, 496(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%r9) -; SSE-NEXT: movaps %xmm6, 416(%r9) +; SSE-NEXT: movaps %xmm11, 416(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 384(%r9) -; SSE-NEXT: movaps %xmm11, 336(%r9) +; SSE-NEXT: movaps %xmm12, 336(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 304(%r9) -; SSE-NEXT: movaps %xmm10, 256(%r9) +; SSE-NEXT: movaps %xmm15, 256(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 224(%r9) -; SSE-NEXT: movaps %xmm2, 176(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%r9) -; SSE-NEXT: movaps %xmm15, 96(%r9) +; SSE-NEXT: movaps %xmm8, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $728, %rsp # imm = 0x2D8 +; SSE-NEXT: addq $600, %rsp # imm = 0x258 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 +; AVX1-ONLY-NEXT: subq $600, %rsp # imm = 0x258 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm9[1],xmm5[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0],xmm5[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm14[0],xmm12[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm11[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0],xmm7[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm13[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0],xmm2[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm6[1,2,3],ymm13[4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm1[1,2,3],ymm5[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm14[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6],ymm13[7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4],ymm13[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm9[2],xmm5[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5],ymm5[6],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm14[2],xmm12[3],xmm14[3] +; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm11[2],xmm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1],ymm13[1,1],ymm5[5,5],ymm13[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm14 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[3,3],xmm6[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,3],ymm6[3,3],ymm9[7,7],ymm6[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4],ymm5[5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,3],ymm6[3,3],ymm8[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4],ymm9[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4],ymm6[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2],ymm5[3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,zero,xmm11[2],xmm7[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm10[2],xmm7[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[1,1],ymm4[5,5],ymm3[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm3[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3],ymm4[3,3],ymm12[7,7],ymm4[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,zero,xmm1[2],xmm0[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[3,3],ymm6[3,3],ymm11[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm6[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4],ymm9[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm9[2],ymm1[3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm2[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm2[2],xmm4[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -2068,110 +2039,109 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm11[1,1],ymm9[5,5],ymm11[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm10[1,1],ymm9[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4],ymm0[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm2[3,3],xmm0[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[3,3],ymm6[3,3],ymm4[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm1[1,2,3,4],ymm15[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = zero,zero,xmm14[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4,5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vbroadcastss 100(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,zero,xmm3[2],xmm5[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,zero,xmm5[2],xmm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm0[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1],ymm14[1,1],ymm3[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,zero,xmm5[2],xmm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm5[1,1],ymm15[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2],ymm13[3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm15[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm13[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,3],ymm2[3,3],ymm15[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm4[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm5[1,2,3,4],ymm0[5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,3],ymm13[3,3],ymm1[7,7],ymm13[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0],ymm8[1,2,3,4],ymm0[5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1],ymm0[2],ymm3[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm13[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm11[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm14[1,0,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, 544(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r9) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm11[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm11 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2],ymm14[3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3],ymm4[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2,3],ymm1[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 608(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 576(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 608(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) @@ -2195,7 +2165,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) -; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268 +; AVX1-ONLY-NEXT: addq $600, %rsp # imm = 0x258 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -3034,213 +3004,209 @@ ; ; AVX512F-LABEL: store_i32_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm14 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm18, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm8 +; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm8 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm16, %zmm11, %zmm10 ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm24, %zmm25 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm23, %zmm25 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 -; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} -; AVX512F-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm14 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm16, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm15, %zmm13, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm17, %zmm5 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm9 {%k3} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm19, %zmm9 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm7, %zmm4, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm3, %zmm1, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm4, %zmm7, %zmm24 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm24 {%k1} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm23, %zmm24 +; AVX512F-NEXT: vpermt2d %zmm7, %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} +; AVX512F-NEXT: vpermt2d %zmm6, %zmm13, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm12, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm2 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm18, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm8 +; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm11, %zmm10 ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm21, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm9 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm9 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm23, %zmm24 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm25, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm24, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm23 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm24, %zmm25 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm23, %zmm25 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] ; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm26, %zmm27 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm28, %zmm29 -; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm29 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm13, %zmm18, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm8 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm20, %zmm8 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm10 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm22, %zmm10 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm25 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm24, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm14 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm16, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm5 +; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm5 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm17, %zmm5 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm3, %zmm9 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm9 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm19, %zmm9 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm4, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm1, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm7, %zmm24 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm24 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm23, %zmm24 +; AVX512BW-NEXT: vpermt2d %zmm7, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm3, %zmm16, %zmm1 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm13, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -3261,83 +3227,84 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $1736, %rsp # imm = 0x6C8 -; SSE-NEXT: movdqa (%rsi), %xmm12 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: subq $1464, %rsp # imm = 0x5B8 +; SSE-NEXT: movdqa (%rsi), %xmm6 +; SSE-NEXT: movdqa 16(%rsi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rdx), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm4 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm13 +; SSE-NEXT: movdqa 32(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm11 +; SSE-NEXT: movaps (%r8), %xmm15 +; SSE-NEXT: movaps 16(%r8), %xmm13 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 -; SSE-NEXT: movaps 32(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 32(%r8), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm8[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm10 -; SSE-NEXT: movdqa 48(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdx), %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm6 -; SSE-NEXT: movaps 48(%r8), %xmm13 -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm13[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 48(%rcx), %xmm4 +; SSE-NEXT: movaps 48(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rdx), %xmm15 +; SSE-NEXT: movdqa 64(%rdx), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm14 -; SSE-NEXT: movaps 64(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 64(%rcx), %xmm7 +; SSE-NEXT: movaps 64(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps 80(%rcx), %xmm13 +; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm13[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3346,26 +3313,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 96(%rcx), %xmm5 +; SSE-NEXT: movaps 96(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r8), %xmm2 +; SSE-NEXT: movaps 112(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 112(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3374,26 +3340,26 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 128(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r8), %xmm2 +; SSE-NEXT: movaps 128(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r8), %xmm2 +; SSE-NEXT: movaps 144(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 144(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3402,26 +3368,25 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm0 +; SSE-NEXT: movaps 160(%rcx), %xmm14 +; SSE-NEXT: movaps 160(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r8), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r8), %xmm2 +; SSE-NEXT: movaps 176(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 176(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3430,26 +3395,26 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 192(%r8), %xmm2 +; SSE-NEXT: movaps 192(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r8), %xmm2 +; SSE-NEXT: movaps 208(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 208(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3458,228 +3423,244 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r8), %xmm2 +; SSE-NEXT: movaps 224(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rsi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 240(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r8), %xmm3 +; SSE-NEXT: movaps 240(%rcx), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-NEXT: movaps 240(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] -; SSE-NEXT: punpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm6[1,1] +; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[2,0] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm15[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm3[0,1] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; SSE-NEXT: movaps 32(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm13[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 80(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 96(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: movaps 64(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] +; SSE-NEXT: movaps 80(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps 96(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movaps 112(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movaps 128(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm4[0],xmm5[1,2,3] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3693,272 +3674,223 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 160(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] +; SSE-NEXT: movaps 160(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 176(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 192(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movaps 192(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm2[2,0] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm15[0],xmm14[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 208(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps 208(%rdi), %xmm12 +; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0] +; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[2,3] +; SSE-NEXT: movaps %xmm2, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[3,0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps 224(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: movaps 224(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm10 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm8 = xmm7[0],xmm8[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE-NEXT: movaps %xmm3, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm2[1] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,0] -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = xmm2[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm0[0],xmm5[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[0,1] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[0,2] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm6[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm6[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm6[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm6[0],xmm3[1,2,3] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm3[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm12 = xmm3[0],xmm12[1,2,3] +; SSE-NEXT: shufps $81, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[1,0],mem[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] ; SSE-NEXT: movaps %xmm0, 1248(%r9) -; SSE-NEXT: movaps %xmm3, 1232(%r9) -; SSE-NEXT: movaps %xmm6, 1200(%r9) -; SSE-NEXT: movaps %xmm8, 1168(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1152(%r9) -; SSE-NEXT: movaps %xmm9, 1120(%r9) +; SSE-NEXT: movaps %xmm2, 1232(%r9) +; SSE-NEXT: movaps %xmm4, 1200(%r9) +; SSE-NEXT: movaps %xmm9, 1168(%r9) +; SSE-NEXT: movaps %xmm7, 1152(%r9) +; SSE-NEXT: movaps %xmm10, 1120(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1088(%r9) -; SSE-NEXT: movaps %xmm10, 1072(%r9) -; SSE-NEXT: movaps %xmm12, 1040(%r9) +; SSE-NEXT: movaps %xmm11, 1072(%r9) +; SSE-NEXT: movaps %xmm13, 1040(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1008(%r9) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 992(%r9) +; SSE-NEXT: movaps %xmm15, 992(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 960(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3991,7 +3923,7 @@ ; SSE-NEXT: movaps %xmm0, 592(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 560(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 528(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 512(%r9) @@ -4038,25 +3970,28 @@ ; SSE-NEXT: movaps %xmm5, 1216(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1184(%r9) -; SSE-NEXT: movaps %xmm4, 1136(%r9) +; SSE-NEXT: movaps %xmm8, 1136(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%r9) -; SSE-NEXT: movaps %xmm7, 1056(%r9) +; SSE-NEXT: movaps %xmm12, 1056(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1024(%r9) -; SSE-NEXT: movaps %xmm11, 976(%r9) +; SSE-NEXT: movaps %xmm14, 976(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 944(%r9) -; SSE-NEXT: movaps %xmm13, 896(%r9) +; SSE-NEXT: movaps %xmm6, 896(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 864(%r9) -; SSE-NEXT: movaps %xmm15, 816(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 816(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%r9) -; SSE-NEXT: movaps %xmm2, 736(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 736(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 704(%r9) -; SSE-NEXT: movaps %xmm14, 656(%r9) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 656(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 624(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4089,24 +4024,28 @@ ; SSE-NEXT: movaps %xmm0, 64(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%r9) -; SSE-NEXT: addq $1736, %rsp # imm = 0x6C8 +; SSE-NEXT: addq $1464, %rsp # imm = 0x5B8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $1784, %rsp # imm = 0x6F8 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm7[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm7[0],xmm8[0],zero,zero +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm8[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 @@ -4114,14 +4053,10 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm14[1],xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = xmm14[0],xmm12[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[1,1],xmm15[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm2[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 @@ -4130,7 +4065,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -4139,8 +4074,8 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = xmm13[0],xmm10[0],zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[1,1],xmm10[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm3 @@ -4156,11 +4091,11 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm5[1],xmm11[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0],xmm11[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[1,1],xmm11[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm7, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 @@ -4174,12 +4109,12 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm9[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm15, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm4[1,2,3],ymm6[4],ymm4[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4187,17 +4122,17 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm4[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3],ymm15[4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2,3],ymm0[4],ymm6[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4205,15 +4140,15 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm6[1],xmm0[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm6[0],xmm0[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm7[1,1],xmm0[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4229,11 +4164,11 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm0[1],xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0],xmm1[0],zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[1,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4241,8 +4176,8 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm7[2],xmm8[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm6[2],xmm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4252,22 +4187,23 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3],xmm2[3,3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 @@ -4282,8 +4218,8 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm12[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm15[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4295,15 +4231,15 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm1[1,1],ymm15[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 @@ -4323,8 +4259,8 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm13[2],xmm10[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,zero,xmm14[2],xmm10[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4334,17 +4270,17 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3],xmm1[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 @@ -4418,11 +4354,11 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 @@ -4430,11 +4366,10 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm13[3,3],ymm2[7,7],ymm13[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 @@ -4491,9 +4426,9 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm6[2],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = zero,zero,xmm7[2],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] @@ -4570,7 +4505,8 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm15[1,0,2,2] +; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4580,7 +4516,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] @@ -4593,7 +4529,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[1,0,2,2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm14[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload @@ -4612,8 +4548,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload @@ -6534,423 +6469,411 @@ ; ; AVX512F-LABEL: store_i32_stride5_vf64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm23 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm11 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm16, %zmm10 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512F-NEXT: vpermt2d %zmm17, %zmm12, %zmm26 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm5, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512F-NEXT: vpermt2d %zmm22, %zmm15, %zmm30 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm19, %zmm20, %zmm21 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm22, %zmm27, %zmm19 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm3, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm4, %zmm6 +; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm15, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm24, %zmm20, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm28, %zmm27, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm3, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm15, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm20, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm23, %zmm27, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512F-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm18, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512F-NEXT: vpermi2d %zmm18, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm18, %zmm0, %zmm4 +; AVX512F-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermi2d %zmm18, %zmm0, %zmm15 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm18, %zmm20 +; AVX512F-NEXT: vpermt2d %zmm18, %zmm27, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm23, %zmm18 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512F-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm16, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm14, %zmm12, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm23, %zmm6 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm27, %zmm14 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm23, %zmm5 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm13 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm16 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm28 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm23 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512F-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm10 {%k1} +; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512F-NEXT: kmovw %eax, %k3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} ; AVX512F-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm21 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm10 +; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm26 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm30 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm17 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} -; AVX512F-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm4 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm9 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm25 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k3} +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm29 {%k2} +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm31 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} ; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm29 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm13 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm18, %zmm16 +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm12 {%k3} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm15 {%k2} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm15 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm20 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} +; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm20, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 64(%r9) ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride5_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] -; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm28 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm23 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [17,0,0,16,2,18,0,1,17,0,0,16,2,18,0,1] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm28 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm11 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm11 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,16,2,18,0,1,17,3,0,16,2,18,0,1,17,3] +; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm16, %zmm10 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm25, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm18, %zmm21 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,3,19,5,21,0,4,20,0,3,19,5,21,0,4,20] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm26, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm23, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm18, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm23, %zmm2 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512BW-NEXT: vpermt2d %zmm17, %zmm12, %zmm26 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [6,22,8,24,0,7,23,9,6,22,8,24,0,7,23,9] +; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm15, %zmm30 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,27,11,0,26,10,28,12,9,27,11,0,26,10,28,12] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm19, %zmm20, %zmm21 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [14,30,0,13,29,15,31,0,14,30,0,13,29,15,31,0] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm22, %zmm27, %zmm19 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm4, %zmm6 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm20, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm28, %zmm27, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm3, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm23, %zmm1 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm15, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,7,23,9,25,0,8,24,0,7,23,9,25,0] +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [27,0,10,26,12,28,0,11,27,0,10,26,12,28,0,11] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm18, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm26, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm20 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm26, %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vpermi2d %zmm18, %zmm0, %zmm15 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm18, %zmm20 +; AVX512BW-NEXT: vpermt2d %zmm18, %zmm27, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [3,21,5,0,20,4,22,6,3,21,5,0,20,4,22,6] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm16, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm29 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm15, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm23, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm11, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm15, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm12, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm23, %zmm18 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,29,15,31,0,14,30,0,13,29,15,31,0,14,30] +; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm16, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm14, %zmm12, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm23, %zmm6 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm27, %zmm14 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm23, %zmm5 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm13 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm16 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm28 +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: movw $12684, %ax # imm = 0x318C ; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k1} +; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 +; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} ; AVX512BW-NEXT: movw $25368, %ax # imm = 0x6318 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm21 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm10 +; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm26 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm30 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm17 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm21 {%k2} -; AVX512BW-NEXT: movw $6342, %ax # imm = 0x18C6 -; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm26 {%k3} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [28,1,2,3,4,29,6,7,8,9,30,11,12,13,14,31] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,25,2,3,4,5,26,7,8,9,10,27,12,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,22,3,4,5,6,23,8,9,10,11,24,13,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm29 +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm4 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm9 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm25 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm24 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm6 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm29 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm31 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} ; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm3 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm29 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm3 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm20, %zmm30 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k3} -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1152(%r9) +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm18, %zmm16 +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm12 {%k3} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm15 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm15 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm20 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm20 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k2} +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r9) ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -45,23 +45,24 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3] -; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm2[2,3],ymm0[4,6],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -70,22 +71,24 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -94,23 +97,24 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,1,3> -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <0,2,4,6,u,u,1,3> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %xmm3, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %xmm2, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -119,43 +123,44 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512-LABEL: store_i32_stride6_vf2: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> -; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) -; AVX512-NEXT: vmovaps %ymm0, (%rax) +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,1,3,5,7,17,21,u,u,u,u> +; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64 @@ -180,36 +185,36 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps (%rdx), %xmm1 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm2 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm4 ; SSE-NEXT: movaps (%r9), %xmm5 -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm2, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm5[3,3] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm5[1,1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[2,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, 80(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm4, (%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm8, 16(%rax) +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf4: @@ -229,7 +234,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] @@ -335,161 +340,161 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride6_vf8: ; SSE: # %bb.0: +; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm7 +; SSE-NEXT: movaps 16(%rdi), %xmm2 +; SSE-NEXT: movaps (%rsi), %xmm11 ; SSE-NEXT: movaps 16(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movaps (%rcx), %xmm5 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps (%r8), %xmm3 -; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm8 -; SSE-NEXT: movaps 16(%r9), %xmm4 -; SSE-NEXT: movaps %xmm9, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm2[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm13[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[2,0] -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,1],xmm4[1,1] -; SSE-NEXT: movaps %xmm2, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; SSE-NEXT: movaps %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm4[3,3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm11[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm12[0,2] -; SSE-NEXT: movaps %xmm3, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm8[3,3] -; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm11[0,2] -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm10[0,2] -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm11[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[0,2] -; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm7, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm14, 160(%rax) -; SSE-NEXT: movaps %xmm2, 176(%rax) -; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: movaps %xmm5, 16(%rax) -; SSE-NEXT: movaps %xmm6, 64(%rax) -; SSE-NEXT: movaps %xmm12, 80(%rax) -; SSE-NEXT: movaps %xmm15, 128(%rax) -; SSE-NEXT: movaps %xmm13, 144(%rax) +; SSE-NEXT: movaps (%rdx), %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps 16(%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm13 +; SSE-NEXT: movaps 16(%r8), %xmm9 +; SSE-NEXT: movaps (%r9), %xmm14 +; SSE-NEXT: movaps 16(%r9), %xmm8 +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] +; SSE-NEXT: movaps %xmm14, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm13[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; SSE-NEXT: movaps %xmm14, %xmm11 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,0],xmm13[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm14[2,0] +; SSE-NEXT: movaps %xmm5, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm9[0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm12[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm13[0] +; SSE-NEXT: movaps %xmm8, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm7, %xmm15 +; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm2[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm9[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm8[2,0] +; SSE-NEXT: movaps %xmm5, 176(%rax) +; SSE-NEXT: movaps %xmm10, 160(%rax) +; SSE-NEXT: movaps %xmm2, 144(%rax) +; SSE-NEXT: movaps %xmm13, 128(%rax) +; SSE-NEXT: movaps %xmm14, 112(%rax) +; SSE-NEXT: movaps %xmm12, 96(%rax) +; SSE-NEXT: movaps %xmm1, 80(%rax) +; SSE-NEXT: movaps %xmm11, 64(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[1,2],xmm5[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm1[1,2],ymm2[1,2],ymm1[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm11[0],mem[0],xmm11[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm12[3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[3,0],ymm2[7,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] @@ -514,49 +519,49 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm13, %ymm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4],ymm12[5],ymm14[6],ymm12[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 160(%rax) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper @@ -565,93 +570,93 @@ ; AVX2-FAST-LABEL: store_i32_stride6_vf8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm12 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm13 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX2-FAST-NEXT: vpbroadcastd %xmm11, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4],ymm13[5],ymm14[6],ymm13[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm12, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,7,0,6,0,7,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,7,0,6,0,7] +; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf8: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] @@ -676,49 +681,49 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm13, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4],ymm11[5],ymm14[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4],ymm12[5],ymm14[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm13[2,3],ymm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -771,133 +776,133 @@ ; SSE-LABEL: store_i32_stride6_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $72, %rsp -; SSE-NEXT: movaps (%rdi), %xmm4 -; SSE-NEXT: movaps 16(%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm3 +; SSE-NEXT: movaps 16(%rdi), %xmm4 ; SSE-NEXT: movaps (%rsi), %xmm8 ; SSE-NEXT: movaps 16(%rsi), %xmm11 -; SSE-NEXT: movaps (%rdx), %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rdx), %xmm5 +; SSE-NEXT: movaps 16(%rdx), %xmm6 ; SSE-NEXT: movaps (%rcx), %xmm1 ; SSE-NEXT: movaps 16(%rcx), %xmm14 ; SSE-NEXT: movaps (%r8), %xmm9 -; SSE-NEXT: movaps 16(%r8), %xmm15 -; SSE-NEXT: movaps (%r9), %xmm2 -; SSE-NEXT: movaps 16(%r9), %xmm0 -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: movaps 16(%r8), %xmm0 +; SSE-NEXT: movaps (%r9), %xmm10 +; SSE-NEXT: movaps 16(%r9), %xmm15 +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm10, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm3[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[2,3] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE-NEXT: movaps %xmm10, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3] +; SSE-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[0,2] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[2,3] -; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm9[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,0],xmm9[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm11[2],xmm4[3],xmm11[3] ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm13 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm14[2],xmm6[3],xmm14[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] -; SSE-NEXT: movaps 32(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rsi), %xmm2 ; SSE-NEXT: movaps %xmm12, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: movaps 32(%r8), %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; SSE-NEXT: movaps 32(%r8), %xmm1 ; SSE-NEXT: movaps 32(%r9), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm1[0] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm14[2,3] ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm3[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm12[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm2[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm4[2,0] +; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: movaps 48(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm3, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE-NEXT: movaps 48(%rdi), %xmm0 ; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] -; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; SSE-NEXT: movaps 48(%r8), %xmm7 +; SSE-NEXT: movaps 48(%r9), %xmm5 +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm5, %xmm10 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm5[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 368(%rax) +; SSE-NEXT: movaps %xmm3, 368(%rax) ; SSE-NEXT: movaps %xmm10, 352(%rax) ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps %xmm4, 320(%rax) ; SSE-NEXT: movaps %xmm6, 304(%rax) -; SSE-NEXT: movaps %xmm5, 288(%rax) +; SSE-NEXT: movaps %xmm2, 288(%rax) ; SSE-NEXT: movaps %xmm13, 272(%rax) ; SSE-NEXT: movaps %xmm8, 256(%rax) ; SSE-NEXT: movaps %xmm12, 240(%rax) @@ -933,347 +938,333 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 36(%r8), %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastss 36(%r8), %xmm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[6],ymm15[6],ymm10[7],ymm15[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2],ymm7[1,2],ymm5[5,6],ymm7[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,2],ymm2[1,2],ymm1[5,6],ymm2[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,0],ymm5[3,0],ymm7[7,4],ymm5[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] -; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[3,0],ymm2[7,4],ymm1[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 96(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf16: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $200, %rsp -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm7, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd (%rcx), %xmm2 ; AVX2-SLOW-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm13, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm12, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 20(%r9), %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd 32(%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm11, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm8, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm9[4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = mem[0],zero,mem[1],zero +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 52(%r9), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 48(%r9), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[6],ymm1[6],ymm15[7],ymm1[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm7, 256(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) @@ -1283,354 +1274,353 @@ ; ; AVX2-FAST-LABEL: store_i32_stride6_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $232, %rsp -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: subq $200, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm4, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm12[2],xmm14[3],xmm12[3] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX2-FAST-NEXT: vpbroadcastd %xmm7, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm4[2],ymm14[3],ymm4[3],ymm14[6],ymm4[6],ymm14[7],ymm4[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm3[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[4],ymm15[4],ymm7[5],ymm15[5] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX2-FAST-NEXT: vpbroadcastd %xmm9, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm10[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm12 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm7[1],ymm11[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[4],ymm3[4],ymm13[5],ymm3[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[6],ymm3[6],ymm13[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm14[0],ymm4[0],ymm14[1],ymm4[1],ymm14[4],ymm4[4],ymm14[5],ymm4[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[6],ymm1[6],ymm15[7],ymm1[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 256(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $232, %rsp +; AVX2-FAST-NEXT: addq $200, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf16: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm13[0],zero,xmm13[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm13, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm1[1],ymm12[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm10[2],ymm14[2],ymm10[3],ymm14[3],ymm10[6],ymm14[6],ymm10[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm10 = mem[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm12[0],zero,xmm12[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 20(%r9), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm9[4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm10 = mem[0],zero,mem[1],zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 52(%r9), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[4],ymm0[4],ymm13[5],ymm0[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[6],ymm0[6],ymm13[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5] +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm15[0],ymm1[0],ymm15[1],ymm1[1],ymm15[4],ymm1[4],ymm15[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 48(%r9), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm15[2],ymm1[2],ymm15[3],ymm1[3],ymm15[6],ymm1[6],ymm15[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = mem[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) @@ -1660,15 +1650,18 @@ ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: movb $-110, %cl +; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1676,23 +1669,20 @@ ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512F-SLOW-NEXT: movb $-110, %cl -; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1713,10 +1703,10 @@ ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper @@ -1731,68 +1721,72 @@ ; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: movb $-110, %cl -; AVX512F-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512F-FAST-NEXT: movb $36, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,6,22,7,23,u,u,7,23,2,18,u,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,6,22,7,23,u,u,7,23,2,18,u,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512F-FAST-NEXT: movb $-110, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,14,30,15,31,u,u,15,31,2,18,u,u> -; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,18,u,u,14,30,15,31,u,u,15,31,2,18,u,u> +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] @@ -1800,9 +1794,9 @@ ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1828,15 +1822,18 @@ ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: movb $-110, %cl +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1844,23 +1841,20 @@ ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 -; AVX512BW-SLOW-NEXT: movb $-110, %cl -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] @@ -1881,10 +1875,10 @@ ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -1899,68 +1893,72 @@ ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm6 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: movb $-110, %cl -; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm8, %zmm9 ; AVX512BW-FAST-NEXT: movb $36, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,6,22,7,23,u,u,7,23,2,18,u,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] ; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,6,22,7,23,u,u,7,23,2,18,u,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm9, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm9 +; AVX512BW-FAST-NEXT: movb $-110, %cl +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm8 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,14,30,15,31,u,u,15,31,2,18,u,u> -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,18,u,u,14,30,15,31,u,u,15,31,2,18,u,u> +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm8 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] @@ -1968,9 +1966,9 @@ ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <16 x i32>, ptr %in.vecptr0, align 64 @@ -2016,22 +2014,22 @@ ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] @@ -2043,25 +2041,25 @@ ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: movaps 32(%rdx), %xmm6 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] @@ -2076,22 +2074,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm7 ; SSE-NEXT: movaps 48(%rcx), %xmm0 @@ -2109,22 +2107,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm7 ; SSE-NEXT: movaps 64(%rcx), %xmm0 @@ -2142,22 +2140,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm7 ; SSE-NEXT: movaps 80(%rcx), %xmm0 @@ -2175,90 +2173,90 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm11 +; SSE-NEXT: movaps 96(%rdx), %xmm12 ; SSE-NEXT: movaps 96(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 96(%rdi), %xmm10 -; SSE-NEXT: movaps 96(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movaps 96(%r8), %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps 96(%rdi), %xmm9 +; SSE-NEXT: movaps 96(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movaps 96(%r8), %xmm1 ; SSE-NEXT: movaps 96(%r9), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm11[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm7[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm1[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm7 ; SSE-NEXT: movaps 112(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 112(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 112(%r8), %xmm3 -; SSE-NEXT: movaps 112(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps 112(%r8), %xmm6 +; SSE-NEXT: movaps 112(%r9), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 752(%rax) +; SSE-NEXT: movaps %xmm7, 752(%rax) ; SSE-NEXT: movaps %xmm14, 736(%rax) ; SSE-NEXT: movaps %xmm0, 720(%rax) -; SSE-NEXT: movaps %xmm4, 704(%rax) -; SSE-NEXT: movaps %xmm6, 688(%rax) -; SSE-NEXT: movaps %xmm5, 672(%rax) -; SSE-NEXT: movaps %xmm11, 656(%rax) -; SSE-NEXT: movaps %xmm9, 640(%rax) -; SSE-NEXT: movaps %xmm10, 624(%rax) -; SSE-NEXT: movaps %xmm12, 608(%rax) +; SSE-NEXT: movaps %xmm3, 704(%rax) +; SSE-NEXT: movaps %xmm5, 688(%rax) +; SSE-NEXT: movaps %xmm2, 672(%rax) +; SSE-NEXT: movaps %xmm12, 656(%rax) +; SSE-NEXT: movaps %xmm10, 640(%rax) +; SSE-NEXT: movaps %xmm9, 624(%rax) +; SSE-NEXT: movaps %xmm13, 608(%rax) ; SSE-NEXT: movaps %xmm15, 592(%rax) -; SSE-NEXT: movaps %xmm13, 576(%rax) +; SSE-NEXT: movaps %xmm11, 576(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 560(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2336,58 +2334,51 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1016, %rsp # imm = 0x3F8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX1-ONLY-NEXT: subq $776, %rsp # imm = 0x308 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm14 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -2396,15 +2387,10 @@ ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm9[0],ymm14[1],ymm9[1],ymm14[4],ymm9[4],ymm14[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm11[0],ymm4[2],ymm11[2] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2414,18 +2400,15 @@ ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -2434,16 +2417,13 @@ ; AVX1-ONLY-NEXT: vbroadcastss 68(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -2453,94 +2433,79 @@ ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm15 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2] -; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 100(%r8), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 100(%r8), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm15[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm15 +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm14[1,2],ymm11[5,6],ymm14[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,2],ymm7[1,2],ymm3[5,6],ymm7[5,6] +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm9[1,2],ymm2[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm2 @@ -2549,9 +2514,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm2, %ymm2 @@ -2559,9 +2523,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 @@ -2571,130 +2534,138 @@ ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1],ymm3[2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,0],ymm8[3,0],ymm7[7,4],ymm8[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vbroadcastss 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],mem[0],xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm7[3,0],mem[3,0],ymm7[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm8, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm15, 576(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2717,13 +2688,13 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1016, %rsp # imm = 0x3F8 +; AVX1-ONLY-NEXT: addq $776, %rsp # imm = 0x308 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $904, %rsp # imm = 0x388 +; AVX2-SLOW-NEXT: subq $936, %rsp # imm = 0x3A8 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 @@ -2739,7 +2710,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] @@ -2782,9 +2753,9 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -2803,9 +2774,9 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm10 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero +; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] @@ -2823,10 +2794,11 @@ ; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm9 @@ -2874,10 +2846,10 @@ ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2887,25 +2859,25 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rcx), %xmm13 +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq %xmm10, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2930,7 +2902,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] @@ -2945,7 +2917,8 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] @@ -2953,7 +2926,7 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 16(%r9), %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -3006,7 +2979,7 @@ ; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm9 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] @@ -3103,349 +3076,350 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $904, %rsp # imm = 0x388 +; AVX2-SLOW-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride6_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $888, %rsp # imm = 0x378 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: subq $1064, %rsp # imm = 0x428 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm7 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm13 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm10 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm8 +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm4 ; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4],ymm0[5],ymm8[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm8 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm11 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm4 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm11 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm7, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm13 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm7 -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2],ymm0[3],ymm13[4],ymm0[5],ymm13[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm12 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm13 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd %xmm13, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd %xmm10, %xmm3 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 96(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 96(%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm10, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm11 = ymm9[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2],ymm0[3],ymm11[4],ymm0[5],ymm11[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm7 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm14 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm14 = ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[6],ymm0[6],ymm10[7],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 116(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm11[1],ymm14[2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[2,3],ymm2[2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[4],ymm9[4],ymm1[5],ymm9[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm11[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm10, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm10, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm12, %ymm4 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[4],ymm13[4],ymm6[5],ymm13[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm8[0],ymm5[0],ymm8[1],ymm5[1],ymm8[4],ymm5[4],ymm8[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[2,3],ymm5[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[4],ymm12[4],ymm15[5],ymm12[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm11 -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm7[2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[2,3],ymm3[2,3] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd 64(%r9), %ymm10, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1],ymm6[2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[4],mem[4],ymm6[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd 96(%r9), %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 736(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 672(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 640(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 544(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 672(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 640(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 544(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 480(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 448(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 352(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm13, 288(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 352(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 288(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm15, 256(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3472,13 +3446,13 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $888, %rsp # imm = 0x378 +; AVX2-FAST-NEXT: addq $1064, %rsp # imm = 0x428 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride6_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $904, %rsp # imm = 0x388 +; AVX2-FAST-PERLANE-NEXT: subq $936, %rsp # imm = 0x3A8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 @@ -3494,7 +3468,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] @@ -3537,9 +3511,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 68(%r9), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] @@ -3558,9 +3532,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7] @@ -3578,10 +3552,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 @@ -3629,10 +3604,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3642,25 +3617,25 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2],ymm13[3],ymm15[4],ymm13[5],ymm15[6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm10 = ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[6],ymm10[6],ymm0[7],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm15 = mem[0],zero,mem[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm10, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3685,7 +3660,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 116(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] @@ -3700,7 +3675,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm10[1],ymm13[2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm14[0],ymm1[0],ymm14[1],ymm1[1],ymm14[4],ymm1[4],ymm14[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] @@ -3708,7 +3684,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 16(%r9), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] @@ -3761,7 +3737,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm9 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7] @@ -3858,553 +3834,561 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $904, %rsp # imm = 0x388 +; AVX2-FAST-PERLANE-NEXT: addq $936, %rsp # imm = 0x3A8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm16 = zmm13[2],zmm11[2],zmm13[3],zmm11[3],zmm13[6],zmm11[6],zmm13[7],zmm11[7],zmm13[10],zmm11[10],zmm13[11],zmm11[11],zmm13[14],zmm11[14],zmm13[15],zmm11[15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm4, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [3,11,0,8,7,15,4,12] +; AVX512F-SLOW-NEXT: vpermt2d (%rcx), %ymm19, %ymm17 ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512F-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,0,1] -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm17[0,1,0,1,2,3,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm17, %zmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm13 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 +; AVX512F-SLOW-NEXT: vpermt2d 64(%rcx), %ymm19, %ymm18 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm18[0,1,0,1,2,3,0,1] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm17, %zmm4 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm4 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm18 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm19, %zmm12 ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[0,1,2,3,2,3,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[0,1,2,3,2,3,0,1] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm12 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[0,1,6,7,6,7,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[0,1,6,7,6,7,0,1] -; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm4, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm23, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm11, %zmm27, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm26, %zmm15 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm19 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm21 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm22, %zmm23 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm23 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm25 +; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm11, %zmm27 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm26, %zmm27 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm11, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm18 = ymm18[2],mem[2],ymm18[3],mem[3],ymm18[6],mem[6],ymm18[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,2,3,2,3,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm17 +; AVX512F-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm11 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm20[0,1,2,3,2,3,0,1] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm18, %zmm11 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512F-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm16[0,1,6,7,6,7,0,1] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm8, %zmm0 +; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm1[2],zmm3[2],zmm1[3],zmm3[3],zmm1[6],zmm3[6],zmm1[7],zmm3[7],zmm1[10],zmm3[10],zmm1[11],zmm3[11],zmm1[14],zmm3[14],zmm1[15],zmm3[15] +; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[0,1,6,7,6,7,0,1] +; AVX512F-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vpermt2d %zmm7, %zmm8, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %ymm16 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm18, %zmm10 +; AVX512F-FAST-NEXT: movb $36, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm20, %zmm1 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm25, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm23, %zmm9 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm19, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm22, %zmm25, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm21, %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm10 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm22, %zmm18 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: movb $-110, %cl +; AVX512F-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm23, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm16 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512F-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm25, %zmm20 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm26, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm27, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm28, %zmm29 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] -; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm21, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm29, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm30, %zmm20 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] +; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm31, %zmm11 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; AVX512F-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 -; AVX512F-FAST-NEXT: movb $-110, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512F-FAST-NEXT: movb $36, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm23, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm24, %zmm17 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm25 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm26, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm27, %zmm19 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm30, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm31, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm11, %zmm3 +; AVX512F-FAST-NEXT: vpermt2d %zmm7, %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i32_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm10 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm13, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm15 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm18, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm14, %zmm12, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm19 = zmm12[2],zmm14[2],zmm12[3],zmm14[3],zmm12[6],zmm14[6],zmm12[7],zmm14[7],zmm12[10],zmm14[10],zmm12[11],zmm14[11],zmm12[14],zmm14[14],zmm12[15],zmm14[15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm14, %zmm3, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [3,11,0,8,7,15,4,12] -; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm21, %ymm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm3 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm16 = zmm13[2],zmm11[2],zmm13[3],zmm11[3],zmm13[6],zmm11[6],zmm13[7],zmm11[7],zmm13[10],zmm11[10],zmm13[11],zmm11[11],zmm13[14],zmm11[14],zmm13[15],zmm11[15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm4, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [3,11,0,8,7,15,4,12] +; AVX512BW-SLOW-NEXT: vpermt2d (%rcx), %ymm19, %ymm17 ; AVX512BW-SLOW-NEXT: movb $36, %dl ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k1} = zmm14[0,1,0,1,2,3,0,1] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm14, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm22, %zmm12 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm21, %ymm20 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm20[0,1,0,1,2,3,0,1] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm14, %zmm3 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm22, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm20, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm11 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k1} = zmm17[0,1,0,1,2,3,0,1] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm17, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm13 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm4 +; AVX512BW-SLOW-NEXT: vpermt2d 64(%rcx), %ymm19, %ymm18 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k1} = zmm18[0,1,0,1,2,3,0,1] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm17, %zmm4 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm4 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm17, %zmm18 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm19, %zmm12 ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm15 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm20, %zmm12 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm22 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm23, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm14 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm14 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm15 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm24, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm25, %zmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm26, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm27, %zmm17 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k1} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm20, %zmm13 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm13 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm23, %zmm16 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm16 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm26, %zmm18 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm27, %zmm18 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm14, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[0,1,2,3,2,3,0,1] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm22, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm1, %zmm14 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm21 = ymm21[2],mem[2],ymm21[3],mem[3],ymm21[6],mem[6],ymm21[7],mem[7] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm21[0,1,2,3,2,3,0,1] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm22, %zmm14 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm23, %zmm14 -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm21, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[14],zmm4[14],zmm2[15],zmm4[15] -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm2[0,1,6,7,6,7,0,1] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm4, %zmm0 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm21, %zmm1 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm19[0,1,6,7,6,7,0,1] -; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm4, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 576(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm26 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm11, %zmm27, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm11, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm26, %zmm15 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm17 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm19 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm18, %zmm19 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm20, %zmm19 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm21 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm22, %zmm23 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm23 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm25 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm1, %zmm27 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm11, %zmm27 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm26, %zmm27 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm11, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm18 = ymm18[2],mem[2],ymm18[3],mem[3],ymm18[6],mem[6],ymm18[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,2,3,2,3,0,1] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm18, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm17 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm6, %zmm2, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm20 = ymm20[2],mem[2],ymm20[3],mem[3],ymm20[6],mem[6],ymm20[7],mem[7] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm20[0,1,2,3,2,3,0,1] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm18, %zmm11 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm21, %zmm11 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] +; AVX512BW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm18, %zmm0 +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm16[0,1,6,7,6,7,0,1] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm9, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm8, %zmm0 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm18, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm1 = zmm1[2],zmm3[2],zmm1[3],zmm3[3],zmm1[6],zmm3[6],zmm1[7],zmm3[7],zmm1[10],zmm3[10],zmm1[11],zmm3[11],zmm1[14],zmm3[14],zmm1[15],zmm3[15] +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm1[0,1,6,7,6,7,0,1] +; AVX512BW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2d %zmm7, %zmm8, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 704(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 640(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, 576(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i32_stride6_vf32: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm24 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm7 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %ymm16 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm17 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm18, %zmm10 +; AVX512BW-FAST-NEXT: movb $36, %cl +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm20, %zmm1 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %ymm22 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm25, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm21, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm23, %zmm9 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm18, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm19, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm20, %zmm2 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm22, %zmm25, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm21, %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm23, %zmm10 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm22, %zmm18 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm16 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm17, %zmm16 +; AVX512BW-FAST-NEXT: movb $-110, %cl +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm23, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm24, %zmm16 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] ; AVX512BW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm10 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm15 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm17, %zmm19 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm25, %zmm20 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm19, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm26, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm27, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm28, %zmm29 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm21, %zmm23 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm18, %zmm16 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm22, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] -; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm17 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm21 -; AVX512BW-FAST-NEXT: vpermi2d %zmm9, %zmm3, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm24, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm0 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm21, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm29, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm30, %zmm20 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] +; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm31, %zmm11 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm25 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm12 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm13 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm18 -; AVX512BW-FAST-NEXT: vpermi2d %zmm7, %zmm1, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm1 -; AVX512BW-FAST-NEXT: movb $-110, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512BW-FAST-NEXT: movb $36, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm9, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm25 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm14, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm9, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm15, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm4, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm7, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm10, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm12, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm7, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm10, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm12, %zmm1 -; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm13, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm23, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm24, %zmm17 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm25 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm26, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm27, %zmm19 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm3, %zmm28 +; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm4, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm29, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm30, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm31, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm11, %zmm3 +; AVX512BW-FAST-NEXT: vpermt2d %zmm7, %zmm13, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -4450,22 +4434,22 @@ ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm13[0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[0,2] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm9[2,3] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm9[2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] @@ -4477,28 +4461,28 @@ ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm5[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm12[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[2,3] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%rdi), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm5[3,3] -; SSE-NEXT: movaps 32(%rdx), %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[3,3] +; SSE-NEXT: movaps 32(%rdx), %xmm7 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps 32(%rcx), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: movaps 32(%rsi), %xmm1 ; SSE-NEXT: movaps %xmm6, %xmm4 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] @@ -4508,25 +4492,25 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,2] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdx), %xmm7 ; SSE-NEXT: movaps 48(%rcx), %xmm0 ; SSE-NEXT: movaps %xmm7, %xmm5 @@ -4543,22 +4527,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdx), %xmm7 ; SSE-NEXT: movaps 64(%rcx), %xmm0 @@ -4576,22 +4560,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdx), %xmm7 ; SSE-NEXT: movaps 80(%rcx), %xmm0 @@ -4609,22 +4593,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%rdx), %xmm7 ; SSE-NEXT: movaps 96(%rcx), %xmm0 @@ -4642,22 +4626,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdx), %xmm7 ; SSE-NEXT: movaps 112(%rcx), %xmm0 @@ -4675,22 +4659,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%rdx), %xmm7 ; SSE-NEXT: movaps 128(%rcx), %xmm0 @@ -4708,22 +4692,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%rdx), %xmm7 ; SSE-NEXT: movaps 144(%rcx), %xmm0 @@ -4741,22 +4725,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdx), %xmm7 ; SSE-NEXT: movaps 160(%rcx), %xmm0 @@ -4774,22 +4758,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%rdx), %xmm7 ; SSE-NEXT: movaps 176(%rcx), %xmm0 @@ -4807,22 +4791,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdx), %xmm7 ; SSE-NEXT: movaps 192(%rcx), %xmm0 @@ -4840,22 +4824,22 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdx), %xmm7 ; SSE-NEXT: movaps 208(%rcx), %xmm0 @@ -4873,90 +4857,90 @@ ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[0,2] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: movaps %xmm0, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm6[2,3] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm6[2,3] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm11 +; SSE-NEXT: movaps 224(%rdx), %xmm12 ; SSE-NEXT: movaps 224(%rcx), %xmm0 -; SSE-NEXT: movaps %xmm11, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] -; SSE-NEXT: movaps 224(%rdi), %xmm10 -; SSE-NEXT: movaps 224(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] -; SSE-NEXT: movaps 224(%r8), %xmm2 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] +; SSE-NEXT: movaps 224(%rdi), %xmm9 +; SSE-NEXT: movaps 224(%rsi), %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] +; SSE-NEXT: movaps 224(%r8), %xmm1 ; SSE-NEXT: movaps 224(%r9), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm13[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1] -; SSE-NEXT: movaps %xmm2, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm10[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm8[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm11[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm13[0] +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm7[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movaps %xmm0, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm9[2,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm1[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm8[2,0] +; SSE-NEXT: movaps 240(%rdx), %xmm7 ; SSE-NEXT: movaps 240(%rcx), %xmm8 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps 240(%rsi), %xmm14 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] -; SSE-NEXT: movaps 240(%r8), %xmm3 -; SSE-NEXT: movaps 240(%r9), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; SSE-NEXT: movaps 240(%r8), %xmm6 +; SSE-NEXT: movaps 240(%r9), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[2,3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE-NEXT: movaps %xmm8, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[2,3] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm6[1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm7[3,3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm6[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 1520(%rax) +; SSE-NEXT: movaps %xmm7, 1520(%rax) ; SSE-NEXT: movaps %xmm14, 1504(%rax) ; SSE-NEXT: movaps %xmm0, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1472(%rax) -; SSE-NEXT: movaps %xmm6, 1456(%rax) -; SSE-NEXT: movaps %xmm5, 1440(%rax) -; SSE-NEXT: movaps %xmm11, 1424(%rax) -; SSE-NEXT: movaps %xmm9, 1408(%rax) -; SSE-NEXT: movaps %xmm10, 1392(%rax) -; SSE-NEXT: movaps %xmm12, 1376(%rax) +; SSE-NEXT: movaps %xmm3, 1472(%rax) +; SSE-NEXT: movaps %xmm5, 1456(%rax) +; SSE-NEXT: movaps %xmm2, 1440(%rax) +; SSE-NEXT: movaps %xmm12, 1424(%rax) +; SSE-NEXT: movaps %xmm10, 1408(%rax) +; SSE-NEXT: movaps %xmm9, 1392(%rax) +; SSE-NEXT: movaps %xmm13, 1376(%rax) ; SSE-NEXT: movaps %xmm15, 1360(%rax) -; SSE-NEXT: movaps %xmm13, 1344(%rax) +; SSE-NEXT: movaps %xmm11, 1344(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1328(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5130,59 +5114,54 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2392, %rsp # imm = 0x958 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX1-ONLY-NEXT: subq $1896, %rsp # imm = 0x768 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r8), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 4(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 16(%r9), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5191,17 +5170,9 @@ ; AVX1-ONLY-NEXT: vbroadcastss 36(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5211,18 +5182,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 48(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5240,7 +5205,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -5251,17 +5216,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 80(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5270,17 +5230,15 @@ ; AVX1-ONLY-NEXT: vbroadcastss 100(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5290,18 +5248,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 112(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5310,16 +5262,15 @@ ; AVX1-ONLY-NEXT: vbroadcastss 132(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[4],ymm6[4],ymm8[5],ymm6[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5329,16 +5280,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 144(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5347,11 +5294,9 @@ ; AVX1-ONLY-NEXT: vbroadcastss 164(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5367,16 +5312,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 176(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5385,15 +5326,15 @@ ; AVX1-ONLY-NEXT: vbroadcastss 196(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm7 +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[2],ymm7[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5403,18 +5344,12 @@ ; AVX1-ONLY-NEXT: vbroadcastss 208(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],mem[1,2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -5423,166 +5358,121 @@ ; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm7 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm15[0],ymm7[0],ymm15[1],ymm7[1],ymm15[4],ymm7[4],ymm15[5],ymm7[5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 240(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 52(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm11 = ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2],ymm14[3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 84(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm12[1,2],mem[1,2],ymm12[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 116(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[6],ymm6[6],ymm8[7],ymm6[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 148(%r9), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,2],ymm8[1,2],ymm0[5,6],ymm8[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 180(%r9), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 212(%r9), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm7[2],ymm15[3],ymm7[3],ymm15[6],ymm7[6],ymm15[7],ymm7[7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm11[1,2],ymm1[5,6],ymm11[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] @@ -5594,9 +5484,8 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r8), %ymm1, %ymm1 @@ -5604,16 +5493,54 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] @@ -5622,7 +5549,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] @@ -5631,34 +5558,32 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] @@ -5672,208 +5597,203 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r8), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 64(%r9), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vbroadcastss 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm12[3,0],ymm0[7,4],ymm12[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r8), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 128(%r9), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7] +; AVX1-ONLY-NEXT: vbroadcastss 160(%rcx), %xmm4 +; AVX1-ONLY-NEXT: vbroadcastss 160(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm10[0],mem[0],xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r8), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 160(%r9), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm10[2],mem[2],xmm10[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1],ymm10[2,3,4,5,6],ymm11[7] +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vbroadcastss 192(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vbroadcastss 192(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm14[0],mem[0],xmm14[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r8), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 192(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm14[2],mem[2],xmm14[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm15[0],mem[0],xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 224(%r9), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm13[2],mem[2],xmm13[3],mem[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0],ymm3[1],ymm13[2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 1408(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1312(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 1504(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1408(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1344(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 1312(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 1152(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm9, 1120(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 1024(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 928(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 832(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 1024(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 928(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 832(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 640(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 448(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5893,18 +5813,12 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 960(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) @@ -5942,7 +5856,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $2392, %rsp # imm = 0x958 +; AVX1-ONLY-NEXT: addq $1896, %rsp # imm = 0x768 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -6030,7 +5944,7 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 100(%r9), %ymm9 @@ -6116,7 +6030,7 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 228(%r9), %ymm14 @@ -6221,13 +6135,13 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastq (%rsp), %ymm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6283,8 +6197,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 148(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 160(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 160(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] @@ -6314,8 +6228,8 @@ ; AVX2-SLOW-NEXT: vpbroadcastd 180(%r9), %ymm15 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpbroadcastd 192(%rcx), %xmm14 +; AVX2-SLOW-NEXT: vpbroadcastd 192(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] @@ -6353,7 +6267,7 @@ ; AVX2-SLOW-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6503,7 +6417,7 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] @@ -6511,7 +6425,7 @@ ; AVX2-SLOW-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] @@ -6645,7 +6559,7 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 -; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] @@ -6692,7 +6606,7 @@ ; AVX2-SLOW-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 672(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 544(%rax) @@ -6766,263 +6680,272 @@ ; ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2312, %rsp # imm = 0x908 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-NEXT: subq $2744, %rsp # imm = 0xAB8 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm8 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm12 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm11 +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm11[0],zero,xmm11[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm9[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm14 -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm14[0],zero,xmm14[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm14[0],zero,xmm14[1],zero +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1] +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,2,2,3] ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm5 -; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss 32(%rcx), %xmm0 -; AVX2-FAST-NEXT: vbroadcastss 32(%rdx), %xmm5 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 32(%r9), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,2,3,5,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 32(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastq %xmm6, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 -; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm5 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm3 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 +; AVX2-FAST-NEXT: vpbroadcastq %xmm8, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 64(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 64(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %ymm6 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero @@ -7030,29 +6953,30 @@ ; AVX2-FAST-NEXT: vpbroadcastd 84(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastd 96(%rcx), %xmm0 +; AVX2-FAST-NEXT: vpbroadcastd 96(%rdx), %xmm3 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpbroadcastq %xmm11, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 96(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm11 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %ymm8 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero @@ -7063,26 +6987,28 @@ ; AVX2-FAST-NEXT: vpbroadcastd 128(%rcx), %xmm0 ; AVX2-FAST-NEXT: vpbroadcastd 128(%rdx), %xmm3 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm15[0],mem[0],xmm15[1],mem[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm14, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 128(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 128(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd %xmm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm10 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %ymm12 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero @@ -7090,62 +7016,68 @@ ; AVX2-FAST-NEXT: vpbroadcastd 148(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm15[0],mem[0],xmm15[1],mem[1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 160(%rcx), %xmm0 +; AVX2-FAST-NEXT: vbroadcastss 160(%rdx), %xmm3 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps (%rsp), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps 160(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %ymm10 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 180(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-FAST-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss 192(%rcx), %xmm0 +; AVX2-FAST-NEXT: vbroadcastss 192(%rdx), %xmm3 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 192(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovaps 192(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm15 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero @@ -7156,313 +7088,304 @@ ; AVX2-FAST-NEXT: vbroadcastss 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vbroadcastss 224(%rdx), %xmm3 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 224(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm14 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm3[2],ymm14[3],ymm3[3],ymm14[6],ymm3[6],ymm14[7],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 244(%r9), %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,5,3,3,6,5,7,7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,2,3,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%r9), %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd 32(%r9), %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[4],ymm6[4],ymm9[5],ymm6[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd 64(%r9), %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[4],ymm8[4],ymm11[5],ymm8[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd 96(%r9), %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd 128(%r9), %ymm2, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0],ymm11[1],ymm1[2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm4[0],ymm10[0],ymm4[1],ymm10[1],ymm4[4],ymm10[4],ymm4[5],ymm10[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm10[2],ymm4[3],ymm10[3],ymm4[6],ymm10[6],ymm4[7],ymm10[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpermd 160(%r9), %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm10[1],ymm1[2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm12[1],ymm1[2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[2,3],ymm15[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermd 192(%r9), %ymm2, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm5, %ymm15 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm0[1],ymm6[2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3],ymm12[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[2,3],ymm6[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd 224(%r9), %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm7, 1504(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 1216(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 1056(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 1024(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 928(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 864(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 1504(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 1440(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 1408(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 1312(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 1248(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, 1216(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 1120(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm13, 1056(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 1024(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 928(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 864(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 832(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7497,7 +7420,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 1280(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1152(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1088(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 960(%rax) @@ -7537,7 +7460,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $2312, %rsp # imm = 0x908 +; AVX2-FAST-NEXT: addq $2744, %rsp # imm = 0xAB8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7625,7 +7548,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%r9), %ymm9 @@ -7711,7 +7634,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 228(%r9), %ymm14 @@ -7816,13 +7739,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 84(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 96(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%rsp), %ymm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7878,8 +7801,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 148(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 160(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 160(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,2,1] @@ -7909,8 +7832,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 180(%r9), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm15[3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 192(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 192(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1] @@ -7948,7 +7871,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rsp), %ymm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8098,7 +8021,7 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] @@ -8106,7 +8029,7 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] @@ -8240,7 +8163,7 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] @@ -8287,7 +8210,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 736(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 672(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 544(%rax) @@ -8630,303 +8553,324 @@ ; ; AVX512F-FAST-LABEL: store_i32_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: subq $1352, %rsp # imm = 0x548 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512F-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %ymm20 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdx), %ymm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rsi), %ymm21 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm4 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm18, %zmm22 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm7, %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm17, %zmm15 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm21, %zmm18, %zmm23 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512F-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] -; AVX512F-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1 -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] +; AVX512F-FAST-NEXT: vpermt2d %zmm3, %zmm30, %zmm14 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm19, %zmm11, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm2, %zmm29 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm31 +; AVX512F-FAST-NEXT: vpermt2d %zmm16, %zmm30, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11 +; AVX512F-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm11 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm30, %zmm5 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512F-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm19 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512F-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512F-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14 -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512F-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm24 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm22 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rdx), %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm15, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm16 ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 +; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm15 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: movb $-110, %al -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2} ; AVX512F-FAST-NEXT: movb $36, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2} -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm12, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm10 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm12, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm6 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm17, %zmm9 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm17, %zmm7 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm17, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm17, %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm19 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm17, %zmm14 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm17, %zmm13 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: movb $-110, %al +; AVX512F-FAST-NEXT: kmovw %eax, %k2 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm4, %zmm2, %zmm30 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k2} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm17, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm24, %zmm31 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm11 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm17, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18 -; AVX512F-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 {%k2} +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512F-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm21 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm2, %zmm27 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm4, %zmm28 +; AVX512F-FAST-NEXT: vpermt2d %zmm6, %zmm5, %zmm30 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm23 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm2, %zmm29 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm31 +; AVX512F-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm18 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm15 +; AVX512F-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm18, 1344(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 1024(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512F-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, 1280(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512F-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-FAST-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -9201,303 +9145,324 @@ ; ; AVX512BW-FAST-LABEL: store_i32_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm15 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm31 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm8, %zmm31 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] -; AVX512BW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm21 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm28 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] -; AVX512BW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: subq $1352, %rsp # imm = 0x548 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm19 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm13 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] +; AVX512BW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm7, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %ymm20 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rdx), %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa 128(%rdx), %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [3,19,0,16,3,19,0,16,7,23,4,20,3,19,0,16] +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm21 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm22 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,18,3,19,6,22,7,23,6,22,7,23,2,18,3,19] +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm19, %zmm7, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm17, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm18, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm7, %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm17, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm21, %zmm18, %zmm23 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rsi), %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] -; AVX512BW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm13, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] -; AVX512BW-FAST-NEXT: vpermt2d %zmm20, %zmm27, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm5, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm9, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm11, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm12, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm13, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm25, %zmm27, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm9, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm11, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm13, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm1 -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm9 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] +; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] +; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [2,18,3,19,14,30,15,31,14,30,15,31,2,18,3,19] +; AVX512BW-FAST-NEXT: vpermt2d %zmm3, %zmm30, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm19, %zmm11, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermt2d %zmm19, %zmm30, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm2, %zmm29 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm31 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm11, %zmm31 +; AVX512BW-FAST-NEXT: vpermt2d %zmm16, %zmm30, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpermi2d %zmm8, %zmm5, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm14, %zmm2, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermt2d %zmm14, %zmm27, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm30, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] +; AVX512BW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm19 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,21,0,0,7,23,6,22,5,21,0,0,7,23,6,22] -; AVX512BW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm14 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm27, %zmm14 -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] -; AVX512BW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm7, %zmm22 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,25,8,24,0,0,10,26,9,25,8,24,0,0,10,26] +; AVX512BW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [3,19,0,16,11,27,8,24,15,31,12,28,3,19,0,16] +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm22 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [13,29,0,0,15,31,14,30,13,29,0,0,15,31,14,30] ; AVX512BW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpermt2d %zmm15, %zmm2, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm13 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm20, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm18, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm15, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm1, %zmm2, %zmm30 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm11 ; AVX512BW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm8, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm6, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm15, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm16 ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm27 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm7 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm3, %zmm15 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm3 -; AVX512BW-FAST-NEXT: movb $-110, %al -; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 {%k2} ; AVX512BW-FAST-NEXT: movb $36, %al ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm31 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, %zmm24 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm12, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm12, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm5 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm12, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm10 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm12, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm6 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm17, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm8 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm17, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm17, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm17, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm17, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm17, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm17, %zmm19 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm17, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm17, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm17, %zmm16 +; AVX512BW-FAST-NEXT: movb $-110, %al +; AVX512BW-FAST-NEXT: kmovd %eax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm17, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm24, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm1, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm28 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm17, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm24, %zmm28 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm4, %zmm2, %zmm30 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm29 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm17, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm0, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm24, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vpermt2d %zmm5, %zmm4, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm20, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm17, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm22, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm26, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm25 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm14 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> -; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm31 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 64(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm24 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm19 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm5 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm13 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 128(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm23 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm10 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm4 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm29 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm30 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm20, %zmm11 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm6, %zmm12 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm22, %zmm27 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm8, %zmm16 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm0, %zmm18 -; AVX512BW-FAST-NEXT: vpermt2d %zmm2, %zmm7, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm24, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm0, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm2, %zmm7 +; AVX512BW-FAST-NEXT: vpermt2d %zmm10, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm4, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] +; AVX512BW-FAST-NEXT: vpermt2d %zmm12, %zmm5, %zmm21 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm2, %zmm27 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm4, %zmm28 +; AVX512BW-FAST-NEXT: vpermt2d %zmm6, %zmm5, %zmm30 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm0, %zmm23 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm2, %zmm29 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm4, %zmm31 +; AVX512BW-FAST-NEXT: vpermt2d %zmm8, %zmm5, %zmm11 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm0, %zmm17 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm2, %zmm18 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 1472(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 1344(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 1280(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 1216(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 1152(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 1088(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 1024(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 1408(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 1344(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 1152(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 1088(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, 1024(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm29, 960(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 832(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, 768(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, (%rax) -; AVX512BW-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm30, 704(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm28, 640(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 1280(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 1216(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 896(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512BW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-FAST-NEXT: addq $1352, %rsp # imm = 0x548 ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i32>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -27,23 +27,23 @@ ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm0[1,0] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm7[2,0] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,3,3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE-NEXT: movaps %xmm1, 32(%rax) -; SSE-NEXT: movaps %xmm7, 16(%rax) +; SSE-NEXT: movaps %xmm6, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movq %xmm2, 48(%rax) +; SSE-NEXT: movaps %xmm1, 32(%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf2: @@ -52,138 +52,76 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm0[2,1],ymm6[4,6],ymm0[6,5] -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vbroadcastsd (%r10), %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[1],xmm2[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vmovaps %xmm1, 32(%rax) +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-ONLY-NEXT: vmovlps %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-SLOW-LABEL: store_i32_stride7_vf2: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovlps %xmm0, 48(%rax) -; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, (%rax) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: store_i32_stride7_vf2: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <3,5,7,u> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1> -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX2-FAST-NEXT: vmovlps %xmm2, 48(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %xmm1, 32(%rax) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf2: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovlps %xmm0, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%rax) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-ONLY-LABEL: store_i32_stride7_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-ONLY-NEXT: movq (%r10), %rcx +; AVX2-ONLY-NEXT: vmovq %rcx, %xmm6 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,4,0,1,0,4,0,1] +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,4,0,4,0,4,0,4] +; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-ONLY-NEXT: vmovd %ecx, %xmm8 +; AVX2-ONLY-NEXT: vpbroadcastd %xmm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm2 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,5,1,5,1,5,1,5] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-ONLY-NEXT: vmovq %xmm2, 48(%rax) +; AVX2-ONLY-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i32_stride7_vf2: ; AVX512F-SLOW: # %bb.0: @@ -197,16 +135,16 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> -; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> +; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax) +; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -222,12 +160,12 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u> ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) @@ -249,16 +187,16 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> -; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u> +; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax) +; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm2, %xmm0 ; AVX512BW-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -274,12 +212,12 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u> ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) @@ -314,48 +252,48 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm1 -; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps (%r9), %xmm2 -; SSE-NEXT: movaps (%r10), %xmm8 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps (%rdx), %xmm3 +; SSE-NEXT: movaps (%rcx), %xmm2 +; SSE-NEXT: movaps (%r8), %xmm8 +; SSE-NEXT: movaps (%r9), %xmm4 +; SSE-NEXT: movaps (%r10), %xmm6 +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm2[2],xmm7[3],xmm2[3] ; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm2[3,3] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm1[1,1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm9[2,0] -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm9[0,1] -; SSE-NEXT: movaps %xmm6, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm10[2,0] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm5[2,0] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm4, 16(%rax) -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[1,1] +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm4[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[2,3,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm6[0,2] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm10[2,0] +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm10[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[2,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = xmm11[0],xmm0[1,2,3] +; SSE-NEXT: movaps %xmm2, 96(%rax) +; SSE-NEXT: movaps %xmm4, 48(%rax) +; SSE-NEXT: movaps %xmm6, 32(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps %xmm7, 64(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: retq ; @@ -363,48 +301,48 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm8 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm8[1,0],ymm9[5,4],ymm8[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm8[2,1],ymm10[6,4],ymm8[6,5] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,1],ymm6[2,0],ymm5[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1],xmm4[1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm11[1,2],ymm6[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm10[3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3],ymm7[3,3],ymm5[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6],ymm11[7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm11[2,0],ymm9[6,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4],ymm10[5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm4[1],ymm7[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm5[2,0],ymm4[5,5],ymm5[6,4] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm11 = zero,xmm3[1],xmm6[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm11[1,2],ymm5[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] ; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[4],ymm4[4],ymm7[5],ymm4[5] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,3],ymm7[3,3],ymm4[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,1],ymm4[2,0],ymm9[6,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] ; AVX1-ONLY-NEXT: vbroadcastss 12(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -453,11 +391,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -468,14 +406,14 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%r10), %xmm0 +; AVX2-FAST-NEXT: vmovaps (%r10), %xmm1 ; AVX2-FAST-NEXT: vinsertf128 $1, (%rsi), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1] @@ -491,7 +429,7 @@ ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] @@ -509,12 +447,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] -; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovaps %xmm0, 96(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -564,11 +502,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm2[1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -580,17 +518,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6] +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u> -; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -617,236 +555,225 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $104, %rsp +; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm13 -; SSE-NEXT: movdqa 16(%rcx), %xmm9 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm4 -; SSE-NEXT: movaps 16(%r9), %xmm1 -; SSE-NEXT: movdqa (%rax), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm15 +; SSE-NEXT: movaps (%rcx), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; SSE-NEXT: movaps 16(%rcx), %xmm12 +; SSE-NEXT: movaps (%r8), %xmm10 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movdqa (%r9), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movdqa 16(%r9), %xmm9 +; SSE-NEXT: movdqa (%rax), %xmm14 +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSE-NEXT: movaps %xmm8, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm5[0],xmm1[1,2,3] -; SSE-NEXT: movaps (%rcx), %xmm3 -; SSE-NEXT: movaps (%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm12[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: movdqa 16(%rax), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm10[3,3] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,1],xmm14[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2],xmm13[2,0] -; SSE-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload -; SSE-NEXT: movaps %xmm8, %xmm13 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movaps %xmm11, %xmm2 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[1,3] -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: movaps %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm9[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,0],xmm11[1,0] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] +; SSE-NEXT: movaps %xmm10, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm5[3,3] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm12[2],xmm8[3],xmm12[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm15[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0] +; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm8[0] -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[0,1],mem[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm11[2,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm1[0],xmm6[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm1[0],xmm7[1,2,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm15[2,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm0[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm4[0],xmm11[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm14[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm4[0],xmm10[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm14, 112(%rax) -; SSE-NEXT: movdqa %xmm15, 176(%rax) -; SSE-NEXT: movaps %xmm9, (%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm2, 64(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm12, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm7, 96(%rax) +; SSE-NEXT: movaps %xmm3, 192(%rax) +; SSE-NEXT: movdqa %xmm8, 176(%rax) +; SSE-NEXT: movaps %xmm12, 128(%rax) +; SSE-NEXT: movaps %xmm2, 112(%rax) +; SSE-NEXT: movaps %xmm9, 64(%rax) +; SSE-NEXT: movaps %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: movaps %xmm6, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm10, 96(%rax) +; SSE-NEXT: movaps %xmm11, 80(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) -; SSE-NEXT: addq $104, %rsp +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm7[2,1],ymm5[6,4],ymm7[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[1,1],xmm5[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm12[0],ymm6[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm11[1,1],xmm9[0,2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm15 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm14[1],xmm15[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm13[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6],ymm9[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm2[1,1],ymm3[1,1],ymm2[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1],ymm4[1,1],ymm1[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4],ymm9[5,6],ymm15[7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,1],ymm15[6,4],ymm7[6,5] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2],ymm9[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm10[1],xmm11[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[1,1],xmm12[0,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[0,2],ymm8[5,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm14[1],xmm13[1],zero +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2],ymm12[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm0[3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm13 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm8[1,1],ymm14[0,2],ymm8[5,5],ymm14[4,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm4[3,3],ymm1[3,3],ymm4[7,7],ymm1[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm1[0,2],ymm3[7,5],ymm1[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,3],xmm6[3,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm8[2],mem[2],xmm8[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6],ymm8[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm5[0,1,2],xmm8[3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4],ymm7[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[3,3],xmm11[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -854,124 +781,126 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm8 -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm6 -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm7 -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm1 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm6 +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm10 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm3 +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm3[1],xmm9[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[6],ymm2[6],ymm7[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm12 -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm14[3,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm9 -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm12 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3,4],ymm8[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6],ymm12[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] -; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,3],ymm6[3,3],ymm5[7,7],ymm6[7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6],ymm7[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rax), %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, (%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm10, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf8: ; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: subq $24, %rsp ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm10 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm0[1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm12 ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero @@ -981,84 +910,85 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[3,3],xmm14[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm3[2],ymm15[3,4,5],ymm3[6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,1,2,2,0,1,2,2] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6],ymm9[7] -; AVX2-FAST-NEXT: vbroadcastsd 24(%rax), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm8 -; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm9 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm14[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm9 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vmovaps %xmm1, %xmm3 +; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm8[2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1],ymm8[2,3,4],ymm1[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6],ymm8[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1],ymm12[2,3,4],ymm8[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,3],ymm6[3,3],ymm13[7,7],ymm6[7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6],ymm7[7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%rax), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-NEXT: addq $24, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1066,103 +996,101 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm3[1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[6],ymm2[6],ymm7[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[6],ymm3[6],ymm8[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[2],ymm10[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3],xmm14[3,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm10[2,3,4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm3[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm11[2,3,4],ymm8[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,1],ymm2[1,1],ymm7[5,5],ymm2[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm6[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[6],ymm7[6],ymm2[7],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm12 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,3],ymm6[3,3],ymm5[7,7],ymm6[7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rax), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3],xmm4[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm10, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -1175,39 +1103,39 @@ ; AVX512F-NEXT: vmovdqa (%r8), %ymm2 ; AVX512F-NEXT: vmovdqa (%r10), %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,23,31,7,6,23,31,7] ; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4],ymm0[5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm6 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> +; AVX512F-NEXT: vpermi2d %zmm4, %zmm1, %zmm7 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1221,39 +1149,39 @@ ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vmovdqa (%r10), %ymm3 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,23,31,7,6,23,31,7] ; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3,4],ymm0[5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10> +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm6 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12> +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm1, %zmm7 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = -; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14> +; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) ; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1280,84 +1208,85 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $536, %rsp # imm = 0x218 +; SSE-NEXT: subq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa (%rsi), %xmm13 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movaps (%rdx), %xmm10 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm9 +; SSE-NEXT: movaps 16(%rcx), %xmm6 +; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm8 +; SSE-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[1,1,1,1] +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm14 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movaps 16(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm4 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] +; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 16(%rax), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: movdqa 32(%rsi), %xmm1 +; SSE-NEXT: movaps 32(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm5 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%r9), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rsi), %xmm1 @@ -1365,178 +1294,176 @@ ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps 48(%r8), %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: movaps 48(%rcx), %xmm8 +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: movaps 48(%r9), %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[0,3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r9), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm7[1,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm7[2,3] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[1,1] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm6[3,3] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1,3] -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm15[0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm5[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movaps %xmm10, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] ; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,0] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm12 -; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm0[0] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm13[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm6[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movaps %xmm8, %xmm13 +; SSE-NEXT: movaps 48(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[3,3] +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm2[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0,1],mem[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = xmm7[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm7[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = xmm6[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm6[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm13[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3] -; SSE-NEXT: shufps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm6[0],xmm14[1,2,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm1[2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm13 = xmm6[0],xmm13[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm9 = xmm6[0],xmm9[1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 416(%rax) -; SSE-NEXT: movaps %xmm4, 400(%rax) -; SSE-NEXT: movaps %xmm3, 384(%rax) +; SSE-NEXT: movaps %xmm7, 416(%rax) +; SSE-NEXT: movaps %xmm5, 400(%rax) +; SSE-NEXT: movaps %xmm8, 384(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 352(%rax) +; SSE-NEXT: movaps %xmm3, 336(%rax) +; SSE-NEXT: movdqa %xmm6, 288(%rax) +; SSE-NEXT: movaps %xmm11, 240(%rax) +; SSE-NEXT: movdqa %xmm9, 224(%rax) +; SSE-NEXT: movaps %xmm12, 176(%rax) +; SSE-NEXT: movaps %xmm15, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 336(%rax) -; SSE-NEXT: movdqa %xmm5, 288(%rax) -; SSE-NEXT: movaps %xmm8, 240(%rax) -; SSE-NEXT: movdqa %xmm11, 224(%rax) -; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm15, 112(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1547,1026 +1474,991 @@ ; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 368(%rax) -; SSE-NEXT: movaps %xmm9, 320(%rax) -; SSE-NEXT: movaps %xmm13, 304(%rax) +; SSE-NEXT: movaps %xmm10, 320(%rax) +; SSE-NEXT: movaps %xmm14, 304(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 256(%rax) -; SSE-NEXT: movaps %xmm14, 208(%rax) -; SSE-NEXT: movaps %xmm7, 192(%rax) +; SSE-NEXT: movaps %xmm2, 208(%rax) +; SSE-NEXT: movaps %xmm13, 192(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 80(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: addq $536, %rsp # imm = 0x218 +; SSE-NEXT: addq $456, %rsp # imm = 0x1C8 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm10 -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm11[0],xmm4[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm12 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,1],xmm10[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1],xmm4[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm15[1],xmm14[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,1],ymm8[1,1],ymm6[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],ymm7[1,1],ymm13[5,5],ymm7[5,5] -; AVX1-ONLY-NEXT: vmovaps %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,1],ymm1[6,4],ymm0[6,5] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1],xmm0[0,2] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm1[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm9[1],xmm2[1],zero -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1],xmm14[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm1[1,1],ymm7[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm6[1,1],ymm7[5,5],ymm6[5,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm2[2,1],ymm4[6,4],ymm2[6,5] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm15[2,1],ymm1[6,4],ymm15[6,5] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1],ymm3[0,2],ymm1[5,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm9[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm9[2,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm0[0],ymm10[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm4[1,1],ymm5[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,1],ymm1[6,4],ymm14[6,5] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[3,3],ymm7[3,3],ymm6[7,7],ymm7[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,3],ymm8[3,3],ymm10[7,7],ymm8[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[6],ymm13[6],ymm6[7],ymm13[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[0,2],ymm11[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm10[3,3],ymm3[7,7],ymm10[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,3],ymm5[3,3],ymm4[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 60(%r9), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rax), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3],xmm11[3,3] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm9[2],mem[2],xmm9[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3],xmm12[3,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm3[0,2],ymm4[7,5],ymm3[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm5[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[3,1],ymm4[0,2],ymm6[7,5],ymm4[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm14[3,3],xmm15[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,3],ymm6[3,3],ymm13[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,3],ymm11[3,3],ymm15[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm5[2,3],ymm3[1,2],ymm5[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm14[0],ymm7[0],ymm14[1],ymm7[1],ymm14[4],ymm7[4],ymm14[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1],ymm6[0,2],ymm13[7,5],ymm6[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[3,3],xmm4[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,1],ymm7[0,2],ymm8[7,5],ymm7[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3],xmm9[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: subq $408, %rsp # imm = 0x198 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm13 -; AVX2-SLOW-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm2 +; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm14 -; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm7 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6 +; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm6[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps (%r8), %ymm11 +; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm10 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm0[1],xmm12[2,3] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm12 = xmm2[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,1,2,1] +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm12[1],xmm13[1],zero +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm6, %xmm0 +; AVX2-SLOW-NEXT: vbroadcastss %xmm6, %xmm2 ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastsd %xmm4, %ymm14 +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm8[3,3],xmm3[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-SLOW-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] +; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm6[3,3],xmm7[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm5[1,1],ymm2[5,5],ymm5[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm9[3,3],xmm8[3,3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps %xmm5, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovaps %xmm7, %xmm10 +; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,3],xmm9[3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-SLOW-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 352(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-SLOW-NEXT: addq $408, %rsp # imm = 0x198 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5,6,5,6,5,6,5,6] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm12 +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm10 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm12[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm9[1],zero -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 -; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm6 +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm6[1],xmm9[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm2[3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm2[1],xmm11[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm4 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm11[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm0[2],ymm13[3,4,5],ymm0[6],ymm13[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3] +; AVX2-FAST-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm13 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm13[0,1,2,1] +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13 +; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm13[1],xmm2[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm0 +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm3 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm14 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,1,2,2,0,1,2,2] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm3, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm13 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm13[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm1[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,3],xmm10[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm4[1,1],ymm15[5,5],ymm4[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [5,6,5,6,5,6,5,6] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm15[2],ymm4[3],ymm15[3],ymm4[6],ymm15[6],ymm4[7],ymm15[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm7, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm6 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm15, %ymm5 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm11[3,3],ymm2[7,7],ymm11[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm2 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps %xmm7, %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vmovaps %xmm8, %xmm10 +; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm4 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm3 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm0 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[3,3],ymm7[3,3],ymm2[7,7],ymm7[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm14[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 128(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm2, 288(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm2, 256(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm13, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: addq $360, %rsp # imm = 0x168 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: subq $408, %rsp # imm = 0x198 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm11[1],zero -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm5[1],xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm9[1],xmm6[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm10[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0],xmm0[1],xmm12[2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm12 = xmm2[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm12[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm12[1],xmm13[1],zero +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm12[3,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm4, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm8[3,3],xmm3[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm3 = ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[6],ymm2[6],ymm13[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm4 = ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[6],ymm5[6],ymm12[7],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm6[3,3],xmm7[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm5[1,1],ymm2[5,5],ymm5[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm4[2],ymm15[3],ymm4[3],ymm15[6],ymm4[6],ymm15[7],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,3],ymm10[3,3],ymm11[7,7],ymm10[7,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3,4],ymm2[5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm12, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm9[3,3],xmm8[3,3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm5, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm10[3,3],xmm9[3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm10[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1],ymm11[1,1],ymm12[5,5],ymm11[5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm7 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r8), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 60(%r9), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rax), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 352(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX2-FAST-PERLANE-NEXT: addq $408, %rsp # imm = 0x198 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2824,56 +2716,56 @@ ; SSE: # %bb.0: ; SSE-NEXT: subq $1256, %rsp # imm = 0x4E8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm10 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa 16(%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm14 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm2 +; SSE-NEXT: movaps 16(%r8), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm15 +; SSE-NEXT: movdqa (%rax), %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,1,1] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm14, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 @@ -2898,8 +2790,8 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2908,13 +2800,12 @@ ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] -; SSE-NEXT: movdqa %xmm8, %xmm12 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 48(%rcx), %xmm13 ; SSE-NEXT: movaps 48(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -2924,24 +2815,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movdqa 64(%rsi), %xmm2 +; SSE-NEXT: movaps 64(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%r9), %xmm1 @@ -2951,23 +2841,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm12 +; SSE-NEXT: movdqa 80(%rsi), %xmm4 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movaps 80(%rcx), %xmm2 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%r9), %xmm1 @@ -2979,42 +2869,41 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 80(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rsi), %xmm4 -; SSE-NEXT: movaps 96(%rdx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 +; SSE-NEXT: movdqa 96(%rsi), %xmm5 +; SSE-NEXT: movaps 96(%rdx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm3 -; SSE-NEXT: movaps 96(%r8), %xmm15 -; SSE-NEXT: movaps %xmm15, %xmm0 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r8), %xmm7 +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rax), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 96(%rax), %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3026,44 +2915,48 @@ ; SSE-NEXT: movaps 112(%rcx), %xmm3 ; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 112(%r9), %xmm15 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm15, %xmm1 -; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm0[1] +; SSE-NEXT: movaps 112(%r9), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[3,2] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: movaps %xmm3, %xmm7 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 112(%rax), %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm2[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm2[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm11[0,2] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3076,32 +2969,32 @@ ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: # xmm14 = xmm14[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -3110,27 +3003,27 @@ ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = xmm8[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = xmm10[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] @@ -3138,18 +3031,18 @@ ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm8[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -3158,22 +3051,20 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm14 = xmm14[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[0],mem[0],xmm12[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -3183,171 +3074,173 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm9 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; SSE-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1] ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm13[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm8 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2] ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm2[0] -; SSE-NEXT: movaps 112(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm12[0,3] -; SSE-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm12[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm12[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,0] +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movaps 112(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm0[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm7[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,3] +; SSE-NEXT: movaps %xmm3, %xmm7 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[0,1] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: # xmm15 = xmm15[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm15[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm1[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm0[0],xmm12[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm15 = xmm0[0],xmm15[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3],xmm1[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[3,3,3,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,3],xmm0[2,0] ; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm14 = xmm0[0],xmm14[1,2,3] +; SSE-NEXT: movss {{.*#+}} xmm13 = xmm0[0],xmm13[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 864(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 864(%rax) ; SSE-NEXT: movaps %xmm5, 848(%rax) +; SSE-NEXT: movaps %xmm3, 832(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 800(%rax) ; SSE-NEXT: movaps %xmm4, 784(%rax) -; SSE-NEXT: movaps %xmm7, 736(%rax) +; SSE-NEXT: movaps %xmm6, 736(%rax) ; SSE-NEXT: movaps %xmm8, 688(%rax) ; SSE-NEXT: movaps %xmm9, 672(%rax) ; SSE-NEXT: movaps %xmm10, 624(%rax) ; SSE-NEXT: movaps %xmm11, 576(%rax) -; SSE-NEXT: movaps %xmm13, 560(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 512(%rax) +; SSE-NEXT: movaps %xmm12, 560(%rax) +; SSE-NEXT: movdqa %xmm14, 512(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 464(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3382,7 +3275,7 @@ ; SSE-NEXT: movaps %xmm0, 816(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 768(%rax) -; SSE-NEXT: movaps %xmm14, 752(%rax) +; SSE-NEXT: movaps %xmm13, 752(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 720(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3393,13 +3286,14 @@ ; SSE-NEXT: movaps %xmm0, 608(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 592(%rax) -; SSE-NEXT: movaps %xmm6, 544(%rax) -; SSE-NEXT: movaps %xmm12, 528(%rax) +; SSE-NEXT: movaps %xmm7, 544(%rax) +; SSE-NEXT: movaps %xmm15, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 480(%rax) -; SSE-NEXT: movaps %xmm15, 432(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 432(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 416(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3435,7 +3329,7 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: subq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3461,54 +3355,53 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm3[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1],ymm4[1,1],ymm2[5,5],ymm4[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -3527,55 +3420,51 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm6[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[1,1],ymm11[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3585,154 +3474,151 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm5[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm13[1,1],ymm14[5,5],ymm13[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm10[1,1],ymm11[5,5],ymm10[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm7[1,1],ymm8[5,5],ymm7[5,5] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm10[0],ymm12[2],ymm10[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,1],ymm1[6,4],ymm10[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm14[0],ymm1[2],ymm14[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm7[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm3[0],xmm5[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0],xmm5[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm15 +; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm4[1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = zero,xmm3[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,1],xmm15[1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[0,2],ymm11[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm13[1],ymm1[3],ymm13[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[0,2],ymm13[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[0,2],ymm12[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[0,2],ymm9[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm5 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm3[3,3],ymm8[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm13[3,3],ymm2[7,7],ymm13[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 124(%r8), %ymm1 @@ -3742,9 +3628,9 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm14[0],ymm7[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,1],ymm0[0,2],ymm14[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[4],ymm13[4],ymm2[5],ymm13[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 108(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -3753,8 +3639,8 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm8[1,1],ymm3[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm2[1,1],ymm13[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 112(%r8), %ymm1 @@ -3765,169 +3651,165 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,3],ymm4[3,3],ymm3[7,7],ymm4[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6],ymm13[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm13[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = mem[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3],ymm6[3,3],ymm0[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,3],ymm14[3,3],ymm15[7,7],ymm14[7,7] +; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,3],ymm12[1,2],ymm0[6,7],ymm12[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm2[1,2,3,4],ymm12[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm14[2],mem[2],xmm14[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6],ymm14[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm13[2,3,4],ymm1[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm14[2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[3,3],ymm14[3,3],ymm5[7,7],ymm14[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm13[3,3],mem[3,3],ymm13[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,3],ymm7[3,3],ymm12[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,3],ymm1[3,3],ymm12[7,7],ymm1[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,3],ymm11[3,3],ymm0[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[2,3],ymm11[1,2],ymm0[6,7],ymm11[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm13[1,2,3,4],ymm11[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,3],ymm10[1,2],ymm0[6,7],ymm10[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm14[1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm0[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm6[3,3],ymm8[3,3],ymm6[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[3,3],ymm10[3,3],ymm8[7,7],ymm10[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,3],ymm0[3,3],ymm1[7,7],ymm0[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[3,3],ymm0[3,3],ymm2[7,7],ymm0[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,3],ymm5[3,3],ymm7[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3],ymm3[1,2],ymm4[6,7],ymm3[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3,4],ymm3[5,6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rax), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm9[3,1],ymm5[0,2],ymm9[7,5],ymm5[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[4],ymm7[4],ymm12[5],ymm7[5] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[3,1],ymm7[0,2],ymm9[7,5],ymm7[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm6[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm7[1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1],ymm9[0,2],ymm6[7,5],ymm9[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[3,3],xmm13[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = xmm7[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1],ymm9[0,2],ymm13[7,5],ymm9[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm1[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm9[1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm11[0,2],ymm8[7,5],ymm11[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm15[3,3],xmm14[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm9 = xmm9[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm6, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 736(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 640(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 640(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) @@ -3963,7 +3845,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%rax) -; AVX1-ONLY-NEXT: addq $1656, %rsp # imm = 0x678 +; AVX1-ONLY-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -5532,405 +5414,391 @@ ; ; AVX512F-LABEL: store_i32_stride7_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512F-NEXT: vpermt2d %zmm5, %zmm3, %zmm12 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512F-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm1, %zmm21, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm14, %zmm13 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C +; AVX512F-NEXT: vpermi2d %zmm10, %zmm0, %zmm18 +; AVX512F-NEXT: vpermi2d %zmm17, %zmm15, %zmm3 +; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vpermi2d %zmm6, %zmm7, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vpermi2d %zmm8, %zmm18, %zmm23 +; AVX512F-NEXT: movw $-30962, %dx # imm = 0x870E +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm3 {%k2} ; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512F-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm19 +; AVX512F-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512F-NEXT: vpermi2d %zmm7, %zmm6, %zmm17 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vpermi2d %zmm8, %zmm17, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512F-NEXT: vpermt2d %zmm6, %zmm28, %zmm17 +; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm8, %zmm21, %zmm17 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512F-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512F-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 -; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 -; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 +; AVX512F-NEXT: kmovw %ecx, %k3 +; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm13 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm31, %zmm29 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm5, %zmm20 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm14 +; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 +; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm21, %zmm28 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm30 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm14 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vpermt2d %zmm8, %zmm28, %zmm30 +; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512F-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 -; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 -; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm18 {%k2} +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm31 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm5, %zmm17 +; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm17 {%k1} +; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm28, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm26 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 -; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm15, %zmm28, %zmm24 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vpermt2d %zmm15, %zmm29, %zmm23 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm5, %zmm21 +; AVX512F-NEXT: vpermi2d %zmm11, %zmm5, %zmm28 +; AVX512F-NEXT: vpermt2d %zmm11, %zmm29, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm25 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm22 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2d %zmm9, %zmm27, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vpermi2d %zmm4, %zmm12, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vpermi2d %zmm4, %zmm16, %zmm10 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vpermt2d %zmm6, %zmm16, %zmm12 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vpermt2d %zmm8, %zmm21, %zmm12 +; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm16 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm21, %zmm16 +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm16 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm22 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512F-NEXT: vpermt2d %zmm8, %zmm24, %zmm11 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1 -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 -; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512F-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512F-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 -; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512F-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512F-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512F-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512F-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} -; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C +; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512F-NEXT: vpermi2d %zmm2, %zmm4, %zmm21 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm24, %zmm21 +; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm15 {%k1} +; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512F-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vpermt2d %zmm8, %zmm6, %zmm7 +; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm19 {%k1} +; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm9, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vpermi2d %zmm2, %zmm10, %zmm5 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512F-NEXT: vpermi2d %zmm20, %zmm4, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512F-NEXT: vpermi2d %zmm20, %zmm5, %zmm4 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: popq %rax +; AVX512F-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i32_stride7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm21 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm17 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm28 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm30, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm7, %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm5, %zmm3, %zmm12 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> -; AVX512BW-NEXT: vpermi2d %zmm28, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm1, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm14, %zmm13 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C +; AVX512BW-NEXT: vpermi2d %zmm10, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2d %zmm17, %zmm15, %zmm3 +; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vpermi2d %zmm6, %zmm7, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm18, %zmm23 +; AVX512BW-NEXT: movw $-30962, %dx # imm = 0x870E +; AVX512BW-NEXT: kmovd %edx, %k2 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm3 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm27, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm2, %zmm31 +; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm19 +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vpermi2d %zmm7, %zmm6, %zmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vpermi2d %zmm8, %zmm17, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm28, %zmm17 +; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm17 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 -; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm28 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm18 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm30 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm7 -; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> -; AVX512BW-NEXT: vpermi2d %zmm3, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm2, %zmm28 -; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm2, %zmm25 -; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 +; AVX512BW-NEXT: kmovd %ecx, %k3 +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm13 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm31, %zmm29 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm5, %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm14 +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm18 +; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm21, %zmm28 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm30 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm28, %zmm30 +; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm22 -; AVX512BW-NEXT: vpermi2d %zmm1, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm31 -; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm31 -; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm18 {%k2} +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm31 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm5, %zmm17 +; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm17 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm28, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm26 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm28, %zmm24 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm15, %zmm14, %zmm17 -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm2, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm28, %zmm24 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vpermt2d %zmm15, %zmm29, %zmm23 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm5, %zmm21 +; AVX512BW-NEXT: vpermi2d %zmm11, %zmm5, %zmm28 +; AVX512BW-NEXT: vpermt2d %zmm11, %zmm29, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm25 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm22 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11 +; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2d %zmm9, %zmm27, %zmm1 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm12, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vpermi2d %zmm4, %zmm16, %zmm10 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm25 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm16, %zmm12 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm21, %zmm12 +; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm12 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm16 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm21, %zmm16 +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm16 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm4 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm22, %zmm19 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm21, %zmm11 +; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm24, %zmm11 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm19, %zmm29 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm2 -; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 -; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm26 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm22, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm22, %zmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm24, %zmm21 -; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm21 {%k2} -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm1, %zmm19 -; AVX512BW-NEXT: vpermi2d %zmm14, %zmm15, %zmm25 -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm25 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm8, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm24, %zmm22 -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = -; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm19, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm10, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm1 -; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1} -; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C +; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm22 {%k1} +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm4, %zmm21 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm15 {%k1} +; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm20, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] -; AVX512BW-NEXT: vpermi2d %zmm27, %zmm3, %zmm0 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm19 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vpermt2d %zmm8, %zmm6, %zmm7 +; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm1 {%k1} +; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm19 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2d %zmm20, %zmm6, %zmm4 +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm9, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm5 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,27,4,5,6,7,8,9,28,11,12,13,14,15] +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31] +; AVX512BW-NEXT: vpermi2d %zmm20, %zmm5, %zmm4 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i32>, ptr %in.vecptr0, align 64 @@ -5958,71 +5826,71 @@ ; SSE: # %bb.0: ; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movaps (%rdx), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps (%rcx), %xmm9 -; SSE-NEXT: movaps 16(%rcx), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movaps (%rdx), %xmm13 +; SSE-NEXT: movdqa 16(%rdx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rcx), %xmm5 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm10[1,1] +; SSE-NEXT: movaps %xmm10, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rax), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 -; SSE-NEXT: movaps 32(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 32(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 32(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%r9), %xmm1 @@ -6032,23 +5900,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 32(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rsi), %xmm2 +; SSE-NEXT: movdqa 48(%rsi), %xmm4 ; SSE-NEXT: movdqa 48(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 48(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r8), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps 48(%rcx), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%r9), %xmm1 @@ -6058,24 +5926,24 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 -; SSE-NEXT: movaps 64(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm8, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 64(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r8), %xmm13 -; SSE-NEXT: movaps %xmm13, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps 64(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%r8), %xmm8 +; SSE-NEXT: movaps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%r9), %xmm1 @@ -6085,23 +5953,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 64(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rsi), %xmm2 +; SSE-NEXT: movdqa 80(%rsi), %xmm6 ; SSE-NEXT: movdqa 80(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 80(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%r9), %xmm1 @@ -6111,24 +5979,24 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 80(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 80(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm1 -; SSE-NEXT: movaps 96(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 96(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 96(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%r9), %xmm1 @@ -6138,23 +6006,23 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 96(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rsi), %xmm2 +; SSE-NEXT: movdqa 112(%rsi), %xmm6 ; SSE-NEXT: movdqa 112(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 112(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%r9), %xmm1 @@ -6166,22 +6034,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 112(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm1 -; SSE-NEXT: movaps 128(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 128(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 128(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%r9), %xmm1 @@ -6193,21 +6061,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 144(%rsi), %xmm2 +; SSE-NEXT: movdqa 144(%rsi), %xmm6 ; SSE-NEXT: movdqa 144(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 144(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 144(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 144(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%r9), %xmm1 @@ -6219,22 +6087,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 144(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm1 -; SSE-NEXT: movaps 160(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 160(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%r9), %xmm1 @@ -6246,21 +6114,21 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 160(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rsi), %xmm2 +; SSE-NEXT: movdqa 176(%rsi), %xmm6 ; SSE-NEXT: movdqa 176(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 176(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 176(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 176(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%r9), %xmm1 @@ -6272,22 +6140,22 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 176(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm1 -; SSE-NEXT: movaps 192(%rdx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE-NEXT: movaps 192(%rcx), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%r9), %xmm1 @@ -6299,146 +6167,196 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 192(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rsi), %xmm3 +; SSE-NEXT: movdqa 208(%rsi), %xmm6 ; SSE-NEXT: movdqa 208(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movaps 208(%rcx), %xmm8 -; SSE-NEXT: movaps 208(%r8), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 208(%rcx), %xmm11 +; SSE-NEXT: movaps 208(%r8), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[1,1] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rax), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] +; SSE-NEXT: movdqa 208(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rax), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rsi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdx), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE-NEXT: movaps 224(%rcx), %xmm3 -; SSE-NEXT: movaps 224(%r8), %xmm0 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movaps 224(%rcx), %xmm2 +; SSE-NEXT: movaps 224(%r8), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 224(%r9), %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 224(%rax), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r9), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movaps 224(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[3,2] +; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[1,1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 240(%rdx), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rcx), %xmm8 +; SSE-NEXT: movaps 240(%rcx), %xmm6 ; SSE-NEXT: movaps 240(%r8), %xmm0 ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps 240(%r9), %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; SSE-NEXT: movaps 240(%r9), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm0[1] +; SSE-NEXT: movaps 240(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm9[3,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm9[1,1] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movaps 240(%rax), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0],xmm3[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm4[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[0,2] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movaps %xmm4, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm3[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: movaps %xmm4, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6446,21 +6364,21 @@ ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm10[1,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] @@ -6470,46 +6388,45 @@ ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[1,0],mem[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm14[1,3] -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm15[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] @@ -6520,93 +6437,44 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 @@ -6618,44 +6486,44 @@ ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[0,2] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movaps %xmm3, %xmm1 @@ -6673,22 +6541,22 @@ ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: shufps $197, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[1,1],mem[0,3] +; SSE-NEXT: shufps $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[1,0],mem[0,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps %xmm15, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6698,7 +6566,7 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -6707,325 +6575,320 @@ ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm0[0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm14[3,3] -; SSE-NEXT: movaps %xmm1, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,3] +; SSE-NEXT: movaps %xmm1, %xmm5 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: movaps %xmm2, %xmm12 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,0] +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm3[0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: movaps 224(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movaps 224(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0] -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[2,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm0 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm1[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, %xmm11 -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm0[0] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: movaps 240(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[0,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[2,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,1] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,0] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm12[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,1] +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] +; SSE-NEXT: shufps $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = xmm5[0,1],mem[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[2,0] +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movaps 240(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm0[0,1] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[3,0],mem[3,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm11[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm0[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm12 = xmm4[0],xmm12[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3] -; SSE-NEXT: shufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,3,3,3] -; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm9 = xmm0[0],xmm9[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = xmm1[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm6 = xmm0[0],xmm6[1,2,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = xmm0[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm0[2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[3,0],mem[3,3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[2,0] +; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,3,3,3] +; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1760(%rax) -; SSE-NEXT: movaps %xmm11, 1744(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1760(%rax) +; SSE-NEXT: movaps %xmm8, 1744(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1728(%rax) +; SSE-NEXT: movaps %xmm15, 1696(%rax) +; SSE-NEXT: movaps %xmm3, 1680(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1696(%rax) -; SSE-NEXT: movaps %xmm5, 1680(%rax) -; SSE-NEXT: movaps %xmm6, 1648(%rax) -; SSE-NEXT: movaps %xmm7, 1632(%rax) +; SSE-NEXT: movaps %xmm0, 1648(%rax) +; SSE-NEXT: movaps %xmm5, 1632(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1616(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1584(%rax) -; SSE-NEXT: movaps %xmm9, 1568(%rax) +; SSE-NEXT: movaps %xmm7, 1568(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1536(%rax) -; SSE-NEXT: movaps %xmm13, 1520(%rax) +; SSE-NEXT: movaps %xmm12, 1520(%rax) ; SSE-NEXT: movaps %xmm10, 1472(%rax) -; SSE-NEXT: movaps %xmm14, 1456(%rax) -; SSE-NEXT: movaps %xmm15, 1408(%rax) +; SSE-NEXT: movaps %xmm13, 1456(%rax) +; SSE-NEXT: movaps %xmm14, 1408(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1360(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7114,29 +6977,27 @@ ; SSE-NEXT: movaps %xmm0, 1552(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1504(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1488(%rax) -; SSE-NEXT: movaps %xmm4, 1440(%rax) -; SSE-NEXT: movaps %xmm8, 1424(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1392(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1376(%rax) -; SSE-NEXT: movaps %xmm12, 1328(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1312(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1280(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1264(%rax) +; SSE-NEXT: movaps %xmm0, 1488(%rax) +; SSE-NEXT: movaps %xmm1, 1440(%rax) +; SSE-NEXT: movaps %xmm4, 1424(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1392(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 1216(%rax) +; SSE-NEXT: movaps %xmm0, 1376(%rax) +; SSE-NEXT: movaps %xmm6, 1328(%rax) +; SSE-NEXT: movaps %xmm9, 1312(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1280(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1264(%rax) +; SSE-NEXT: movaps %xmm11, 1216(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1200(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1168(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 1152(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1168(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 1152(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1104(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7179,7 +7040,8 @@ ; SSE-NEXT: movaps %xmm0, 592(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 544(%rax) -; SSE-NEXT: movaps %xmm2, 528(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 528(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 496(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7221,79 +7083,78 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: subq $3112, %rsp # imm = 0xC28 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] +; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm5[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1],xmm6[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm8[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm2[1,1],ymm0[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1],ymm1[1,1],ymm12[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7309,55 +7170,51 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm8 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm6[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm8[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7373,42 +7230,38 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 @@ -7417,16 +7270,16 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm2[0],ymm15[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm2[0],ymm11[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7436,42 +7289,38 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm8[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm7 +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm4[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm7[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 @@ -7480,16 +7329,17 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[1,1],ymm1[5,5],ymm0[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm2[0],ymm10[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,1],ymm1[6,4],ymm2[6,5] ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7499,42 +7349,39 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 @@ -7543,11 +7390,11 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7563,61 +7410,56 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] -; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1],xmm7[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm9[1],xmm6[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[1,1],ymm0[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],ymm1[1,1],ymm6[5,5],ymm1[5,5] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm5[1,1],ymm1[5,5],ymm5[5,5] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[2,1],ymm1[6,4],ymm9[6,5] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm12 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,1],ymm1[6,4],ymm12[6,5] ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm1[2,3] @@ -7626,54 +7468,51 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm10[0],xmm1[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1],xmm7[1,1] +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm11[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1],xmm1[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm4[1],xmm5[1],zero -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm2[1],xmm1[1],zero +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm10[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1],xmm2[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm4[1,1],ymm5[5,5],ymm4[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm14 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm6[1,1],ymm14[5,5],ymm6[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm14[1,1],ymm7[5,5],ymm14[5,5] +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1],ymm10[1,1],ymm7[5,5],ymm10[5,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm11[0],ymm3[2],ymm11[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm11[2,1],ymm2[6,4],ymm11[6,5] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %ymm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[2,1],ymm2[6,4],ymm4[6,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] @@ -7682,10 +7521,11 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] @@ -7700,7 +7540,8 @@ ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7715,31 +7556,31 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm15[1],ymm1[3],ymm15[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[1,1],ymm1[0,2],ymm15[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[0,2],ymm11[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm10[1],ymm1[3],ymm10[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm1[0,2],ymm10[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[0,2],ymm2[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] @@ -7750,8 +7591,7 @@ ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7763,57 +7603,56 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm13[1],ymm9[3],ymm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1],ymm1[0,2],ymm13[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1],ymm1[0,2],ymm9[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm6[1],ymm14[3],ymm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[6],ymm10[6],ymm7[7],ymm10[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm11[1],ymm3[1],ymm11[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm1[0,2],ymm3[5,5],ymm1[4,6] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm8[1],ymm4[3],ymm8[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm1[0,2],ymm8[5,5],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rax), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1],xmm0[0,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm2[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[1],xmm4[1],zero ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1,2],ymm3[3,4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastss 228(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,2],xmm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[3,3],xmm2[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[3,3],xmm2[3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -7823,9 +7662,9 @@ ; AVX1-ONLY-NEXT: vbroadcastss 232(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,3],ymm7[3,3],ymm14[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3],ymm7[3,3],ymm10[7,7],ymm7[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3],ymm5[3,3],ymm6[7,7],ymm5[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,3],ymm14[3,3],ymm6[7,7],ymm14[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 220(%r8), %ymm3 @@ -7836,8 +7675,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm8[0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] @@ -7848,13 +7687,13 @@ ; AVX1-ONLY-NEXT: vbroadcastss 224(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,3],ymm3[3,3],ymm4[7,7],ymm3[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,3],ymm4[3,3],ymm8[7,7],ymm4[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3],ymm2[3,3],ymm6[7,7],ymm2[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm3[3,3],ymm2[7,7],ymm3[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 252(%r8), %ymm1 @@ -7864,9 +7703,9 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm8[3,1],ymm0[0,2],ymm8[7,5],ymm0[4,6] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 236(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] @@ -7875,8 +7714,8 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1],ymm4[1,1],ymm3[5,5],ymm4[5,5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm6[1,1],ymm2[5,5],ymm6[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1],ymm8[1,1],ymm4[5,5],ymm8[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1],ymm2[1,1],ymm3[5,5],ymm2[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastsd 240(%r8), %ymm1 @@ -7889,13 +7728,12 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] @@ -7907,9 +7745,9 @@ ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm7[3,3],ymm1[7,7],ymm7[7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm11[3,3],ymm1[7,7],ymm11[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7924,13 +7762,12 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX1-ONLY-NEXT: vpermilps $170, (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] @@ -7942,9 +7779,9 @@ ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[3,3],mem[3,3],ymm4[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7959,10 +7796,9 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -7974,11 +7810,12 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm8[3,3],ymm0[7,7],ymm8[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm6[3,3],ymm1[7,7],ymm6[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -7993,10 +7830,9 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8008,11 +7844,12 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm12[3,3],ymm0[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm15[3,3],ymm1[7,7],ymm15[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm3[3,3],mem[3,3],ymm3[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8027,182 +7864,179 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm14[3,3],mem[3,3],ymm14[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,3],ymm13[3,3],ymm12[7,7],ymm13[7,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,3],ymm1[3,3],ymm12[7,7],ymm1[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,3],ymm9[3,3],ymm1[7,7],ymm9[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,3],ymm11[3,3],ymm10[7,7],ymm11[7,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm8[3,3],mem[3,3],ymm8[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3],ymm10[3,3],ymm9[7,7],ymm10[7,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm7[3,3],mem[3,3],ymm7[7,7],mem[7,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm5[3,3],mem[3,3],ymm5[7,7],mem[7,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,3],mem[3,3],ymm2[7,7],mem[7,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4],ymm0[5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[1],ymm7[1],ymm0[4],ymm7[4],ymm0[5],ymm7[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[2],ymm7[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm11[0],ymm0[2],ymm11[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,1],ymm2[0,2],ymm11[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[4],ymm6[4],ymm0[5],ymm6[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[3,1],ymm2[0,2],ymm6[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm12[0],ymm1[0],ymm12[1],ymm1[1],ymm12[4],ymm1[4],ymm12[5],ymm1[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm2 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm4[0],ymm14[0],ymm4[2],ymm14[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[3,1],ymm12[0,2],ymm14[7,5],ymm12[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1,2,3],ymm2[4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm1[3,3],mem[3,3] -; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],ymm1[0],ymm11[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[3,3],xmm15[3,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm3[3,1],ymm11[0,2],ymm3[7,5],ymm11[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm3[3,3],mem[3,3] +; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[4],ymm9[4],ymm3[5],ymm9[5] +; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[3,1],ymm12[0,2],ymm3[7,5],ymm12[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm15[3,3],mem[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[0,2],ymm1[7,5],ymm14[4,6] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm8[0],ymm4[2],ymm8[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[3,1],ymm13[0,2],ymm8[7,5],ymm13[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm5[3,3],xmm8[3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[3,3],xmm14[3,3] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm12[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm12[1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm12 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm12 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1],ymm13[0,2],ymm3[7,5],ymm13[4,6] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm5[3,3],xmm6[3,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],mem[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm9, 1440(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 1216(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 992(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 1440(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 1216(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 992(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8303,58 +8137,58 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX1-ONLY-NEXT: addq $3416, %rsp # imm = 0xD58 +; AVX1-ONLY-NEXT: addq $3112, %rsp # imm = 0xC28 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: subq $3016, %rsp # imm = 0xBC8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps (%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm2 ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-SLOW-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm4[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8376,10 +8210,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm11 +; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm11[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8533,11 +8366,11 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8665,8 +8498,8 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm3, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-SLOW-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -8678,8 +8511,8 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -8694,10 +8527,11 @@ ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8768,9 +8602,8 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -8787,7 +8620,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] @@ -8798,10 +8631,10 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm5 ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm6 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -8809,30 +8642,30 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8840,21 +8673,21 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8862,8 +8695,8 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8871,21 +8704,21 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8893,8 +8726,8 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-SLOW-NEXT: vbroadcastss %xmm11, %xmm6 +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8911,7 +8744,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] @@ -8922,13 +8755,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 220(%r8), %ymm6 @@ -8941,16 +8774,18 @@ ; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[4],ymm12[4],ymm0[5],ymm12[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] @@ -8963,8 +8798,8 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -8975,33 +8810,33 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm9[1,1],ymm1[5,5],ymm9[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9013,14 +8848,14 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-SLOW-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -9090,9 +8925,9 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[1,1],ymm0[5,5],ymm8[5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -9108,7 +8943,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9117,30 +8952,31 @@ ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm14[3,3],ymm11[7,7],ymm14[7,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[6],ymm12[6],ymm15[7],ymm12[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9150,14 +8986,13 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -9165,9 +9000,9 @@ ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -9237,8 +9072,8 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload @@ -9248,7 +9083,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -9259,8 +9094,9 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX2-SLOW-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdx), %ymm11 @@ -9269,8 +9105,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm8, 1440(%rax) @@ -9282,9 +9117,9 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm6, 768(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm13, 640(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm12, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 320(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9373,7 +9208,7 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-SLOW-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-SLOW-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -9383,53 +9218,53 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 +; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero -; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm5[1],zero +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1],xmm6[1],zero ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9569,11 +9404,11 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[6],ymm1[6],ymm13[7],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9609,11 +9444,11 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9701,12 +9536,11 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm9 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[6],ymm9[6],ymm11[7],ymm9[7] ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm8 @@ -9788,7 +9622,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm13[2],ymm9[3],ymm13[3],ymm9[6],ymm13[6],ymm9[7],ymm13[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm9[2],ymm11[2],ymm9[3],ymm11[3],ymm9[6],ymm11[6],ymm9[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm15 @@ -9833,10 +9667,10 @@ ; AVX2-FAST-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9844,19 +9678,19 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9867,8 +9701,8 @@ ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm11[1,1],mem[1,1],ymm11[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm13[1,1],mem[1,1],ymm13[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -9879,30 +9713,30 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm8, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[3,3],xmm7[3,3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9939,7 +9773,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -9950,16 +9784,16 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -10006,12 +9840,12 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm11[1,1],ymm1[5,5],ymm11[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm12[1,1],ymm1[5,5],ymm12[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10054,12 +9888,12 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm9[1,1],ymm1[5,5],ymm9[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10081,9 +9915,9 @@ ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] @@ -10094,20 +9928,20 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm8[1,1],ymm1[5,5],ymm8[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10119,31 +9953,31 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 +; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] @@ -10151,10 +9985,10 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -10168,12 +10002,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10182,27 +10016,27 @@ ; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -10211,17 +10045,17 @@ ; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm6[2],mem[2],ymm6[3],mem[3],ymm6[6],mem[6],ymm6[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10229,127 +10063,125 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[4],ymm15[4],ymm2[5],ymm15[5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[4],ymm3[4],ymm11[5],ymm3[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm5[2],mem[2],ymm5[3],mem[3],ymm5[6],mem[6],ymm5[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm1[6],ymm11[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm11 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[4],ymm2[4],ymm15[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm4[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm5[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm11[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0],ymm11[1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm12 = ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[6],ymm15[6],ymm2[7],ymm15[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm12 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0],ymm12[1,2],ymm15[3,4],ymm12[5,6],ymm15[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6],ymm15[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3,4],ymm11[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm5[1,2,3,4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm3[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0],ymm12[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm3[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3,4],ymm5[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0],ymm0[1,2,3,4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm9 +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6],ymm15[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm15[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3,4],ymm9[5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[4],ymm10[4],ymm7[5],ymm10[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1,2,3,4,5],ymm7[6],ymm15[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm8 = xmm1[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],mem[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm7, 1440(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 1216(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 1088(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm12, 992(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm11, 864(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 640(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm15, 544(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 544(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10450,52 +10282,52 @@ ; ; AVX2-FAST-PERLANE-LABEL: store_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: subq $3016, %rsp # imm = 0xBC8 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm8[1],xmm9[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm3[1],zero +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm7[1],xmm4[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10517,10 +10349,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm11[1],xmm2[1],zero +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm10[1],xmm11[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10674,11 +10505,11 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10806,8 +10637,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm3, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 224(%rax), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,1,2,1] @@ -10819,8 +10650,8 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%rax), %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10835,10 +10666,11 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10909,9 +10741,8 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -10928,7 +10759,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] @@ -10939,10 +10770,10 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -10950,30 +10781,30 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm1[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -10981,21 +10812,21 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11003,8 +10834,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -11012,21 +10843,21 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11034,8 +10865,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm11, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -11052,7 +10883,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm8[3,3],xmm4[3,3] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] @@ -11063,13 +10894,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 220(%r8), %ymm6 @@ -11082,16 +10913,18 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[4],ymm10[4],ymm12[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm12[0],ymm0[1],ymm12[1],ymm0[4],ymm12[4],ymm0[5],ymm12[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm12[2],ymm0[3],ymm12[3],ymm0[6],ymm12[6],ymm0[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm6 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] @@ -11104,8 +10937,8 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm6[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm1[2],ymm12[3],ymm1[3],ymm12[6],ymm1[6],ymm12[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11116,33 +10949,33 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm7[1,1],ymm1[5,5],ymm7[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm9[1,1],ymm1[5,5],ymm9[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11154,14 +10987,14 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm15[1,1],ymm1[5,5],ymm15[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -11231,9 +11064,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm4[1,1],ymm0[5,5],ymm4[5,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[1,1],ymm0[5,5],ymm8[5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -11249,7 +11082,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -11258,30 +11091,31 @@ ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm8[2],ymm3[3],ymm8[3],ymm3[6],ymm8[6],ymm3[7],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,3],ymm12[3,3],ymm9[7,7],ymm12[7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,3],ymm14[3,3],ymm11[7,7],ymm14[7,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm14[0],ymm10[1],ymm14[1],ymm10[4],ymm14[4],ymm10[5],ymm14[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[3,3],mem[3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm10[2],ymm14[3],ymm10[3],ymm14[6],ymm10[6],ymm14[7],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[6],ymm12[6],ymm15[7],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11291,14 +11125,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[4],ymm4[4],ymm2[5],ymm4[5] +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -11306,9 +11139,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] @@ -11378,8 +11211,8 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload @@ -11389,7 +11222,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -11400,8 +11233,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0],ymm8[1,2],ymm10[3,4],ymm8[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $39, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdx), %ymm11 @@ -11410,8 +11244,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm1[3,3],mem[3,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 1440(%rax) @@ -11423,9 +11256,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 768(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 640(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 544(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 416(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11514,7 +11347,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1600(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 1568(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $3000, %rsp # imm = 0xBB8 +; AVX2-FAST-PERLANE-NEXT: addq $3016, %rsp # imm = 0xBC8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -12476,7 +12309,6 @@ ; AVX: {{.*}} ; AVX1: {{.*}} ; AVX2: {{.*}} -; AVX2-ONLY: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -151,45 +151,45 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm5 -; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps (%r8), %xmm2 -; SSE-NEXT: movaps (%r9), %xmm7 -; SSE-NEXT: movaps (%r11), %xmm8 +; SSE-NEXT: movaps (%rsi), %xmm6 +; SSE-NEXT: movaps (%rdx), %xmm4 +; SSE-NEXT: movaps (%rcx), %xmm7 +; SSE-NEXT: movaps (%r8), %xmm1 +; SSE-NEXT: movaps (%r9), %xmm8 +; SSE-NEXT: movaps (%r11), %xmm5 ; SSE-NEXT: movaps (%r10), %xmm9 -; SSE-NEXT: movaps %xmm1, %xmm10 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0] -; SSE-NEXT: movaps %xmm8, %xmm11 +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] +; SSE-NEXT: movaps %xmm5, %xmm11 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1] +; SSE-NEXT: movaps %xmm1, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE-NEXT: movaps %xmm1, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE-NEXT: movaps %xmm1, 112(%rax) ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: movaps %xmm2, 112(%rax) -; SSE-NEXT: movaps %xmm6, 80(%rax) -; SSE-NEXT: movaps %xmm5, 64(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm7, 80(%rax) +; SSE-NEXT: movaps %xmm6, 64(%rax) ; SSE-NEXT: movaps %xmm12, 48(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) ; SSE-NEXT: movaps %xmm13, 16(%rax) -; SSE-NEXT: movaps %xmm3, (%rax) +; SSE-NEXT: movaps %xmm2, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf4: @@ -197,50 +197,50 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm9[0],ymm6[0],ymm9[2],ymm6[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[4],ymm8[4],ymm4[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,0],ymm5[4,5],ymm7[6,4] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,0],ymm4[1,0],ymm8[5,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,0],ymm7[2,3],ymm10[6,4],ymm7[6,7] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm0[1],xmm1[1],zero,zero -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,0],ymm4[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[0,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,0],ymm3[1,0],ymm2[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm8[2,3],ymm10[6,4],ymm8[6,7] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm10 = xmm9[1],xmm0[1],zero,zero +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm6[1],ymm9[3],ymm6[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm7[1],ymm5[1],ymm7[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm2[2],xmm3[2] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm12 = zero,zero,xmm6[2],xmm1[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[6],ymm9[6],ymm6[7],ymm9[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[3,0],ymm8[7,4],ymm4[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[6],ymm7[6],ymm5[7],ymm7[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm3[3,0],ymm2[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,3],ymm2[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,0],xmm0[3,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -352,133 +352,118 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride8_vf8: ; SSE: # %bb.0: -; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps (%rsi), %xmm0 -; SSE-NEXT: movaps (%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm7 -; SSE-NEXT: movaps (%r9), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm9 +; SSE-NEXT: movaps 16(%rdi), %xmm4 +; SSE-NEXT: movaps (%rsi), %xmm1 +; SSE-NEXT: movaps 16(%rsi), %xmm8 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdx), %xmm11 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm12 +; SSE-NEXT: movaps (%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm0 ; SSE-NEXT: movaps (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm15 -; SSE-NEXT: movaps (%rax), %xmm4 -; SSE-NEXT: movaps 16(%rax), %xmm10 -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0] -; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: movaps (%rax), %xmm15 +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: movaps %xmm9, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm14[0] +; SSE-NEXT: movaps %xmm7, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm14[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm12, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, %xmm0 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm15[0] -; SSE-NEXT: movaps 16(%r9), %xmm10 -; SSE-NEXT: movaps %xmm7, %xmm13 -; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] -; SSE-NEXT: movaps %xmm13, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm0[0] -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; SSE-NEXT: movaps %xmm15, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm4[1] -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm10[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm14[1,1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm9[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3],xmm4[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3],xmm5[3,3] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm9[0,2] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movaps %xmm9, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm15[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm1[2,0] -; SSE-NEXT: movaps %xmm12, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,3],xmm9[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm12[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE-NEXT: movaps %xmm7, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,0],xmm11[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm12[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,0],xmm14[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm15[2,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; SSE-NEXT: movaps %xmm14, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] +; SSE-NEXT: movaps 16(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] +; SSE-NEXT: movaps 16(%r8), %xmm15 +; SSE-NEXT: movaps 16(%r9), %xmm8 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[2,0] +; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] +; SSE-NEXT: movaps %xmm13, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm15, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,0] +; SSE-NEXT: shufps $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = xmm13[3,0],mem[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm13[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm11[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm2, 224(%rax) -; SSE-NEXT: movaps %xmm7, 240(%rax) -; SSE-NEXT: movaps %xmm3, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm4, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 32(%rax) -; SSE-NEXT: movaps %xmm8, 48(%rax) -; SSE-NEXT: movaps %xmm10, 192(%rax) -; SSE-NEXT: movaps %xmm11, 208(%rax) +; SSE-NEXT: movaps %xmm15, 240(%rax) +; SSE-NEXT: movaps %xmm4, 224(%rax) +; SSE-NEXT: movaps %xmm8, 208(%rax) +; SSE-NEXT: movaps %xmm2, 192(%rax) +; SSE-NEXT: movaps %xmm3, 176(%rax) +; SSE-NEXT: movaps %xmm14, 160(%rax) +; SSE-NEXT: movaps %xmm5, 144(%rax) +; SSE-NEXT: movaps %xmm12, 128(%rax) +; SSE-NEXT: movaps %xmm7, 112(%rax) +; SSE-NEXT: movaps %xmm9, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm6, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf8: @@ -486,227 +471,219 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm9 ; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,0],ymm11[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,0],ymm7[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm9[2,3],ymm7[6,4],ymm9[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm11 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm0[1],xmm15[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm8[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm10 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm11 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm3[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm12[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%r11), %ymm3 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[4],ymm3[4],ymm9[5],ymm3[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[4],ymm5[4],ymm2[5],ymm5[5] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 24(%r11), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm10 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[6],ymm5[6],ymm2[7],ymm5[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm7[1],ymm3[1],ymm7[3],ymm3[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm10[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm12 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm12 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r11), %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastss %xmm10, %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,3,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm8[1],xmm14[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm10[0],xmm7[0],xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm11[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm0[1],xmm15[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastss %xmm4, %xmm10 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastss %xmm6, %xmm12 -; AVX2-ONLY-NEXT: vbroadcastss %xmm7, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm11 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -827,9 +804,9 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps (%rcx), %xmm3 ; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm6 +; SSE-NEXT: movaps (%r9), %xmm7 ; SSE-NEXT: movaps (%r10), %xmm5 -; SSE-NEXT: movaps (%rax), %xmm7 +; SSE-NEXT: movaps (%rax), %xmm6 ; SSE-NEXT: movaps %xmm3, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] ; SSE-NEXT: movaps %xmm9, %xmm13 @@ -837,52 +814,52 @@ ; SSE-NEXT: movaps %xmm13, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1] ; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; SSE-NEXT: movaps %xmm9, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movaps %xmm10, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movaps 16(%rax), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm9 @@ -891,121 +868,121 @@ ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm3[1,1] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm1 -; SSE-NEXT: movaps 32(%rcx), %xmm3 -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movaps 32(%rdx), %xmm0 +; SSE-NEXT: movaps 32(%rcx), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movaps 32(%rdi), %xmm14 -; SSE-NEXT: movaps 32(%rsi), %xmm2 -; SSE-NEXT: movaps %xmm14, %xmm15 -; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] -; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] +; SSE-NEXT: movaps 32(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: movaps %xmm13, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm0 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE-NEXT: movaps 32(%r10), %xmm4 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 32(%r8), %xmm9 ; SSE-NEXT: movaps 32(%r9), %xmm6 -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 +; SSE-NEXT: movaps %xmm9, %xmm15 +; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; SSE-NEXT: movaps %xmm15, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm5[2,0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] ; SSE-NEXT: movaps %xmm14, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[0,2] -; SSE-NEXT: movaps 48(%rdx), %xmm1 -; SSE-NEXT: movaps 48(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm0 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[2,0] +; SSE-NEXT: movaps 48(%rdx), %xmm7 +; SSE-NEXT: movaps 48(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] ; SSE-NEXT: movaps 48(%rdi), %xmm4 -; SSE-NEXT: movaps 48(%rsi), %xmm12 +; SSE-NEXT: movaps 48(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm0 -; SSE-NEXT: movaps 48(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%r10), %xmm6 +; SSE-NEXT: movaps 48(%rax), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE-NEXT: movaps 48(%r8), %xmm3 -; SSE-NEXT: movaps 48(%r9), %xmm13 +; SSE-NEXT: movaps 48(%r9), %xmm12 ; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] ; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm2[2,0] -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] -; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] -; SSE-NEXT: movaps %xmm3, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm0[2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm11[2],xmm4[3],xmm11[3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm12[2],xmm3[3],xmm12[3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm3, 496(%rax) ; SSE-NEXT: movaps %xmm4, 480(%rax) -; SSE-NEXT: movaps %xmm13, 464(%rax) -; SSE-NEXT: movaps %xmm12, 448(%rax) +; SSE-NEXT: movaps %xmm12, 464(%rax) +; SSE-NEXT: movaps %xmm11, 448(%rax) ; SSE-NEXT: movaps %xmm8, 432(%rax) ; SSE-NEXT: movaps %xmm5, 416(%rax) ; SSE-NEXT: movaps %xmm10, 400(%rax) @@ -1017,8 +994,8 @@ ; SSE-NEXT: movaps %xmm0, 336(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 320(%rax) -; SSE-NEXT: movaps %xmm11, 304(%rax) -; SSE-NEXT: movaps %xmm15, 288(%rax) +; SSE-NEXT: movaps %xmm15, 304(%rax) +; SSE-NEXT: movaps %xmm13, 288(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 272(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1060,491 +1037,463 @@ ; ; AVX1-ONLY-LABEL: store_i32_stride8_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: subq $136, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm5[3,0],ymm6[7,4],ymm5[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,0],ymm2[2,3],ymm10[6,4],ymm2[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[1,0],ymm6[5,4],ymm5[5,4] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm14 ; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm13[2,3],ymm5[6,4],ymm13[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm13 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7] +; AVX1-ONLY-NEXT: vmovaps (%rax), %ymm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm10[3,0],ymm11[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm13[1,0],ymm14[5,4],ymm13[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,0],ymm10[4,5],ymm2[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1],xmm11[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm11 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,0],ymm13[3,0],ymm14[7,4],ymm13[7,4] +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %ymm13 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,0],ymm15[2,3],ymm14[6,4],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %ymm14 +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[4],ymm12[4],ymm11[5],ymm12[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[4],ymm14[4],ymm13[5],ymm14[5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,0],ymm11[1,0],ymm12[5,4],ymm11[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,0],ymm5[2,3],ymm6[6,4],ymm5[6,7] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[4],ymm10[4],ymm9[5],ymm10[5] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[1,0],ymm11[5,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,0],ymm15[4,5],ymm6[6,4] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm5[0],xmm11[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,0],ymm0[2,3],ymm11[6,4],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1],xmm12[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm10 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm10[0],xmm2[0],xmm10[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm12 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm13 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm12 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm4[1],xmm13[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 320(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX1-ONLY-NEXT: addq $264, %rsp # imm = 0x108 +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i32_stride8_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: subq $136, %rsp ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm13 -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm14 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm4[2],ymm10[3],ymm4[3],ymm10[6],ymm4[6],ymm10[7],ymm4[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm0 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm9[0,1],xmm14[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[4],ymm13[4],ymm12[5],ymm13[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm10[0],ymm4[0],ymm10[1],ymm4[1],ymm10[4],ymm4[4],ymm10[5],ymm4[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[6],ymm1[6],ymm6[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[6],ymm0[6],ymm9[7],ymm0[7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm4 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm11[1],ymm4[1],ymm11[3],ymm4[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm14 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm13[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm15 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[4],ymm11[4],ymm0[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm1[0],ymm15[2],ymm1[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm10 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm12 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm13 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm5 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[4],ymm1[4],ymm6[5],ymm1[5] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 20(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7] -; AVX2-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm5[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 52(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm15 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm12 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm1 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm2 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm6 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[6],ymm11[6],ymm0[7],ymm11[7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm13[0],xmm10[0],xmm13[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 +; AVX2-ONLY-NEXT: vbroadcastss %xmm11, %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm14 +; AVX2-ONLY-NEXT: vbroadcastss %xmm14, %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm1 +; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm15 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm1[0],xmm14[0],xmm1[1],xmm14[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm2[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm9 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastss %xmm12, %xmm0 +; AVX2-ONLY-NEXT: vbroadcastss %xmm13, %xmm1 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm3 +; AVX2-ONLY-NEXT: vbroadcastss %xmm15, %xmm14 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm4 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm0 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm0 -; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastss %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vbroadcastss %xmm5, %xmm13 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm15[2],xmm5[2],xmm15[3],xmm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rax) -; AVX2-ONLY-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-ONLY-NEXT: addq $136, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1817,8 +1766,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: movaps (%rcx), %xmm4 ; SSE-NEXT: movaps (%r8), %xmm11 -; SSE-NEXT: movaps (%r9), %xmm6 -; SSE-NEXT: movaps (%r10), %xmm5 +; SSE-NEXT: movaps (%r9), %xmm5 +; SSE-NEXT: movaps (%r10), %xmm6 ; SSE-NEXT: movaps (%rax), %xmm7 ; SSE-NEXT: movaps %xmm4, %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] @@ -1828,100 +1777,100 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] ; SSE-NEXT: movaps %xmm11, %xmm14 -; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1] ; SSE-NEXT: movaps %xmm14, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm2[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm3[0,2] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] -; SSE-NEXT: movaps 16(%r10), %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[0,2] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[3,3] +; SSE-NEXT: movaps 16(%r10), %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movaps %xmm10, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rax), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movaps %xmm3, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movaps %xmm10, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rax), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movaps 16(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] ; SSE-NEXT: movaps %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm3[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm11, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 32(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%r10), %xmm4 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1929,48 +1878,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%r10), %xmm4 +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -1978,48 +1927,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 64(%rdi), %xmm8 ; SSE-NEXT: movaps 64(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%r10), %xmm4 +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2027,48 +1976,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 80(%rdi), %xmm8 ; SSE-NEXT: movaps 80(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%r10), %xmm4 +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -2076,134 +2025,134 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps 96(%rdi), %xmm12 +; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: movaps 96(%rsi), %xmm3 -; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movaps %xmm6, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 96(%r8), %xmm7 -; SSE-NEXT: movaps 96(%r9), %xmm9 -; SSE-NEXT: movaps %xmm7, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; SSE-NEXT: movaps %xmm11, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%r10), %xmm4 +; SSE-NEXT: movaps 96(%rax), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps 96(%r8), %xmm7 +; SSE-NEXT: movaps 96(%r9), %xmm10 +; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; SSE-NEXT: movaps %xmm12, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] -; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm5[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm1[0,2] -; SSE-NEXT: movaps 112(%rdx), %xmm2 -; SSE-NEXT: movaps 112(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm2[2,0] +; SSE-NEXT: movaps 112(%rdx), %xmm5 +; SSE-NEXT: movaps 112(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: movaps 112(%rdi), %xmm0 ; SSE-NEXT: movaps 112(%rsi), %xmm13 -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSE-NEXT: movaps 112(%r8), %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%r10), %xmm4 +; SSE-NEXT: movaps 112(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: movaps 112(%r8), %xmm8 ; SSE-NEXT: movaps 112(%r9), %xmm14 -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] -; SSE-NEXT: movaps %xmm8, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm4[2,0] -; SSE-NEXT: movaps %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] -; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm4[2,0] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movaps %xmm2, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] -; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] +; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm5[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm2[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm3, 1008(%rax) +; SSE-NEXT: movaps %xmm8, 1008(%rax) ; SSE-NEXT: movaps %xmm0, 992(%rax) ; SSE-NEXT: movaps %xmm14, 976(%rax) ; SSE-NEXT: movaps %xmm13, 960(%rax) -; SSE-NEXT: movaps %xmm8, 944(%rax) -; SSE-NEXT: movaps %xmm9, 928(%rax) -; SSE-NEXT: movaps %xmm10, 912(%rax) +; SSE-NEXT: movaps %xmm9, 944(%rax) +; SSE-NEXT: movaps %xmm10, 928(%rax) +; SSE-NEXT: movaps %xmm11, 912(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 896(%rax) ; SSE-NEXT: movaps %xmm7, 880(%rax) -; SSE-NEXT: movaps %xmm12, 864(%rax) +; SSE-NEXT: movaps %xmm6, 864(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 848(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 832(%rax) -; SSE-NEXT: movaps %xmm11, 816(%rax) +; SSE-NEXT: movaps %xmm12, 816(%rax) ; SSE-NEXT: movaps %xmm15, 800(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 784(%rax) @@ -2317,8 +2266,8 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm10 @@ -2521,100 +2470,100 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -2627,8 +2576,8 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8 -; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 @@ -2642,12 +2591,12 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] @@ -3255,56 +3204,56 @@ ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 ; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 @@ -3326,11 +3275,11 @@ ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512F-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm13 @@ -3351,33 +3300,33 @@ ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm30 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = ; AVX512F-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -3391,28 +3340,28 @@ ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512F-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -3513,22 +3462,22 @@ ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 960(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -3544,56 +3493,56 @@ ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 @@ -3615,11 +3564,11 @@ ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 @@ -3640,33 +3589,33 @@ ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm27, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm22, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm24, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm30 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm26, %zmm30 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm19, %zmm9 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -3680,28 +3629,28 @@ ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm11, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm8, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm7, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm4, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm3, %zmm17 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -3802,22 +3751,22 @@ ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm19 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm19, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 960(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 896(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 768(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512BW-NEXT: addq $2056, %rsp # imm = 0x808 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3873,32 +3822,32 @@ ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm3[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,3],xmm8[2,0] ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm3[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm2[2],xmm9[3],xmm2[3] ; SSE-NEXT: movaps %xmm9, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1] +; SSE-NEXT: movaps %xmm7, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%rcx), %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,3] ; SSE-NEXT: movaps 16(%r10), %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm7[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm3 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] @@ -3919,47 +3868,47 @@ ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm4[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm5[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm3[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm3[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm0 -; SSE-NEXT: movaps 32(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 32(%rdx), %xmm1 +; SSE-NEXT: movaps 32(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 32(%rdi), %xmm8 ; SSE-NEXT: movaps 32(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%r10), %xmm1 -; SSE-NEXT: movaps 32(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 32(%r10), %xmm4 +; SSE-NEXT: movaps 32(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 32(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -3967,48 +3916,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rdx), %xmm0 -; SSE-NEXT: movaps 48(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 48(%rdx), %xmm1 +; SSE-NEXT: movaps 48(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 48(%rdi), %xmm8 ; SSE-NEXT: movaps 48(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%r10), %xmm1 -; SSE-NEXT: movaps 48(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 48(%r10), %xmm4 +; SSE-NEXT: movaps 48(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 48(%r8), %xmm11 ; SSE-NEXT: movaps 48(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4016,48 +3965,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm0 -; SSE-NEXT: movaps 64(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 64(%rdx), %xmm1 +; SSE-NEXT: movaps 64(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 64(%rdi), %xmm8 ; SSE-NEXT: movaps 64(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%r10), %xmm1 -; SSE-NEXT: movaps 64(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 64(%r10), %xmm4 +; SSE-NEXT: movaps 64(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 64(%r8), %xmm11 ; SSE-NEXT: movaps 64(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4065,48 +4014,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdx), %xmm0 -; SSE-NEXT: movaps 80(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 80(%rdx), %xmm1 +; SSE-NEXT: movaps 80(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 80(%rdi), %xmm8 ; SSE-NEXT: movaps 80(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%r10), %xmm1 -; SSE-NEXT: movaps 80(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 80(%r10), %xmm4 +; SSE-NEXT: movaps 80(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 80(%r8), %xmm11 ; SSE-NEXT: movaps 80(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4114,48 +4063,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm0 -; SSE-NEXT: movaps 96(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 96(%rdx), %xmm1 +; SSE-NEXT: movaps 96(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 96(%rdi), %xmm8 ; SSE-NEXT: movaps 96(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%r10), %xmm1 -; SSE-NEXT: movaps 96(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 96(%r10), %xmm4 +; SSE-NEXT: movaps 96(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 96(%r8), %xmm11 ; SSE-NEXT: movaps 96(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4163,48 +4112,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%rdx), %xmm0 -; SSE-NEXT: movaps 112(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 112(%rdx), %xmm1 +; SSE-NEXT: movaps 112(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 112(%rdi), %xmm8 ; SSE-NEXT: movaps 112(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 112(%r10), %xmm1 -; SSE-NEXT: movaps 112(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 112(%r10), %xmm4 +; SSE-NEXT: movaps 112(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 112(%r8), %xmm11 ; SSE-NEXT: movaps 112(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4212,48 +4161,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdx), %xmm0 -; SSE-NEXT: movaps 128(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 128(%rdx), %xmm1 +; SSE-NEXT: movaps 128(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 128(%rdi), %xmm8 ; SSE-NEXT: movaps 128(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%r10), %xmm1 -; SSE-NEXT: movaps 128(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 128(%r10), %xmm4 +; SSE-NEXT: movaps 128(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 128(%r8), %xmm11 ; SSE-NEXT: movaps 128(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4261,48 +4210,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%rdx), %xmm0 -; SSE-NEXT: movaps 144(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 144(%rdx), %xmm1 +; SSE-NEXT: movaps 144(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 144(%rdi), %xmm8 ; SSE-NEXT: movaps 144(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 144(%r10), %xmm1 -; SSE-NEXT: movaps 144(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 144(%r10), %xmm4 +; SSE-NEXT: movaps 144(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 144(%r8), %xmm11 ; SSE-NEXT: movaps 144(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4310,48 +4259,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdx), %xmm0 -; SSE-NEXT: movaps 160(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 160(%rdx), %xmm1 +; SSE-NEXT: movaps 160(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 160(%rdi), %xmm8 ; SSE-NEXT: movaps 160(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%r10), %xmm1 -; SSE-NEXT: movaps 160(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 160(%r10), %xmm4 +; SSE-NEXT: movaps 160(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 160(%r8), %xmm11 ; SSE-NEXT: movaps 160(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4359,48 +4308,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%rdx), %xmm0 -; SSE-NEXT: movaps 176(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 176(%rdx), %xmm1 +; SSE-NEXT: movaps 176(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 176(%rdi), %xmm8 ; SSE-NEXT: movaps 176(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 176(%r10), %xmm1 -; SSE-NEXT: movaps 176(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 176(%r10), %xmm4 +; SSE-NEXT: movaps 176(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 176(%r8), %xmm11 ; SSE-NEXT: movaps 176(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4408,48 +4357,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdx), %xmm0 -; SSE-NEXT: movaps 192(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 192(%rdx), %xmm1 +; SSE-NEXT: movaps 192(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps 192(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%r10), %xmm1 -; SSE-NEXT: movaps 192(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 192(%r10), %xmm4 +; SSE-NEXT: movaps 192(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 192(%r8), %xmm11 ; SSE-NEXT: movaps 192(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4457,48 +4406,48 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdx), %xmm0 -; SSE-NEXT: movaps 208(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 208(%rdx), %xmm1 +; SSE-NEXT: movaps 208(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 208(%rdi), %xmm8 ; SSE-NEXT: movaps 208(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm8, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; SSE-NEXT: movaps %xmm5, %xmm4 ; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%r10), %xmm1 -; SSE-NEXT: movaps 208(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] +; SSE-NEXT: movaps 208(%r10), %xmm4 +; SSE-NEXT: movaps 208(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] ; SSE-NEXT: movaps 208(%r8), %xmm11 ; SSE-NEXT: movaps 208(%r9), %xmm6 ; SSE-NEXT: movaps %xmm11, %xmm10 @@ -4506,128 +4455,128 @@ ; SSE-NEXT: movaps %xmm10, %xmm9 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm5[2,0] ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,3],xmm5[2,0] ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movaps %xmm11, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm0[2,0] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm2[2,0] ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%rdx), %xmm0 -; SSE-NEXT: movaps 224(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps 224(%rdx), %xmm1 +; SSE-NEXT: movaps 224(%rcx), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movaps 224(%rdi), %xmm12 ; SSE-NEXT: movaps 224(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: unpcklps {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] ; SSE-NEXT: movaps %xmm15, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 224(%r10), %xmm1 -; SSE-NEXT: movaps 224(%rax), %xmm4 -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps 224(%r8), %xmm8 -; SSE-NEXT: movaps 224(%r9), %xmm6 -; SSE-NEXT: movaps %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: movaps %xmm11, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,0] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%r10), %xmm4 +; SSE-NEXT: movaps 224(%rax), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE-NEXT: movaps 224(%r8), %xmm9 +; SSE-NEXT: movaps 224(%r9), %xmm8 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,0] +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,3],xmm5[2,0] -; SSE-NEXT: movaps %xmm4, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,3],xmm5[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm3[2],xmm12[3],xmm3[3] ; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm12, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: movaps %xmm2, %xmm3 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movaps %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm2[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[0,2] -; SSE-NEXT: movaps 240(%rdx), %xmm2 -; SSE-NEXT: movaps 240(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,3],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm2[2,0] +; SSE-NEXT: movaps 240(%rdx), %xmm7 +; SSE-NEXT: movaps 240(%rcx), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] ; SSE-NEXT: movaps 240(%rdi), %xmm0 ; SSE-NEXT: movaps 240(%rsi), %xmm13 ; SSE-NEXT: movaps %xmm0, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm5, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0] -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 240(%r10), %xmm1 -; SSE-NEXT: movaps 240(%rax), %xmm6 -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 240(%r10), %xmm6 +; SSE-NEXT: movaps 240(%rax), %xmm2 +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: movaps 240(%r8), %xmm4 ; SSE-NEXT: movaps 240(%r9), %xmm14 -; SSE-NEXT: movaps %xmm4, %xmm9 -; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm7, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm3[2,0] -; SSE-NEXT: movaps %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm3[2,0] +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1] +; SSE-NEXT: movaps %xmm8, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[2,0] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm1[2,0] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm6[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] ; SSE-NEXT: movaps %xmm0, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm4, %xmm14 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm7[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm6[3,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm7[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm6[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,3],xmm2[2,0] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movaps %xmm4, 2032(%rax) ; SSE-NEXT: movaps %xmm0, 2016(%rax) ; SSE-NEXT: movaps %xmm14, 2000(%rax) ; SSE-NEXT: movaps %xmm13, 1984(%rax) -; SSE-NEXT: movaps %xmm9, 1968(%rax) +; SSE-NEXT: movaps %xmm8, 1968(%rax) ; SSE-NEXT: movaps %xmm5, 1952(%rax) ; SSE-NEXT: movaps %xmm10, 1936(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1920(%rax) -; SSE-NEXT: movaps %xmm8, 1904(%rax) +; SSE-NEXT: movaps %xmm9, 1904(%rax) ; SSE-NEXT: movaps %xmm12, 1888(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 1872(%rax) @@ -4875,8 +4824,8 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 @@ -5807,8 +5756,8 @@ ; AVX2-ONLY-LABEL: store_i32_stride8_vf64: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 @@ -5819,8 +5768,8 @@ ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rax), %ymm10 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm12 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[4],ymm12[4],ymm10[5],ymm12[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 @@ -5842,21 +5791,21 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 24(%rax), %ymm13 +; AVX2-ONLY-NEXT: vbroadcastss 24(%r10), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3,4,5,6],ymm13[7] ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm10 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm8 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 28(%r10), %ymm7 +; AVX2-ONLY-NEXT: vbroadcastss 28(%rax), %ymm7 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 @@ -5886,7 +5835,7 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 56(%rax), %ymm6 +; AVX2-ONLY-NEXT: vbroadcastss 56(%r10), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm6[7] ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] @@ -5900,7 +5849,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 60(%r10), %ymm3 +; AVX2-ONLY-NEXT: vbroadcastss 60(%rax), %ymm3 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm9[1],ymm3[1],ymm9[3],ymm3[3] @@ -5916,8 +5865,8 @@ ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 64(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[4],ymm3[4],ymm5[5],ymm3[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -5933,7 +5882,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 88(%rax), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastss 88(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[6],ymm7[6],ymm4[7],ymm7[7] @@ -5944,7 +5893,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 92(%r10), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastss 92(%rax), %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] @@ -5964,8 +5913,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -5981,7 +5930,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 120(%rax), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 120(%r10), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] @@ -5992,7 +5941,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 124(%r10), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastss 124(%rax), %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[6],ymm9[6],ymm0[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6012,8 +5961,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 128(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6029,7 +5978,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 152(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 152(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6040,7 +5989,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 156(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 156(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6060,8 +6009,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 160(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6077,7 +6026,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 184(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 184(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6088,7 +6037,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 188(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 188(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6108,8 +6057,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 192(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 192(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 192(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 192(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6125,7 +6074,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 216(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 216(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6136,7 +6085,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 220(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 220(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6156,8 +6105,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 224(%r10), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 224(%r10), %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] @@ -6173,7 +6122,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 248(%rax), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 248(%r10), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] @@ -6184,7 +6133,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastss 252(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastss 252(%rax), %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm9[2],ymm2[3],ymm9[3],ymm2[6],ymm9[6],ymm2[7],ymm9[7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -6202,9 +6151,9 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm8 @@ -6257,8 +6206,8 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -6312,8 +6261,8 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%r10), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] @@ -6367,9 +6316,9 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 96(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm8 @@ -6422,9 +6371,9 @@ ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 128(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 128(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm8 @@ -6477,9 +6426,9 @@ ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 160(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 -; AVX2-ONLY-NEXT: vmovaps 160(%r10), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm8 @@ -6533,9 +6482,9 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm2 -; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vmovaps 192(%r10), %xmm3 -; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm8 +; AVX2-ONLY-NEXT: vbroadcastss %xmm3, %xmm7 +; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm7 @@ -6548,7 +6497,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] @@ -6557,8 +6506,8 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] @@ -6568,7 +6517,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -6583,9 +6532,9 @@ ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm6[0],xmm14[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%r10), %xmm2 ; AVX2-ONLY-NEXT: vbroadcastss %xmm2, %xmm15 +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm1 ; AVX2-ONLY-NEXT: vbroadcastss %xmm1, %xmm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm15 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll @@ -60,15 +60,15 @@ ; SSE-NEXT: movaps (%rsi), %xmm2 ; SSE-NEXT: movaps 16(%rsi), %xmm3 ; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm2, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm4, 16(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm2, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf4: @@ -130,25 +130,25 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm6 ; SSE-NEXT: movaps 48(%rsi), %xmm7 ; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; SSE-NEXT: movaps %xmm1, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] ; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, 96(%rdx) -; SSE-NEXT: movaps %xmm6, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm5, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm4, 48(%rdx) -; SSE-NEXT: movaps %xmm0, (%rdx) -; SSE-NEXT: movaps %xmm8, 16(%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm7[1] +; SSE-NEXT: movaps %xmm3, 112(%rdx) +; SSE-NEXT: movaps %xmm6, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm5, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm4, 32(%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf8: @@ -182,22 +182,22 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -240,48 +240,48 @@ ; SSE-NEXT: movaps 32(%rsi), %xmm14 ; SSE-NEXT: movaps 48(%rsi), %xmm15 ; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm9[0] ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] ; SSE-NEXT: movaps %xmm1, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm14[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm14[1] ; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0] +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1] ; SSE-NEXT: movaps %xmm3, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm13[1] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0] +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1] ; SSE-NEXT: movaps %xmm4, %xmm13 -; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm12[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm12[1] ; SSE-NEXT: movaps %xmm6, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm11[1] ; SSE-NEXT: movaps 112(%rsi), %xmm11 ; SSE-NEXT: movaps %xmm0, %xmm7 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0] -; SSE-NEXT: movaps %xmm0, 224(%rdx) -; SSE-NEXT: movaps %xmm7, 240(%rdx) -; SSE-NEXT: movaps %xmm6, 192(%rdx) -; SSE-NEXT: movaps %xmm12, 208(%rdx) -; SSE-NEXT: movaps %xmm4, 160(%rdx) -; SSE-NEXT: movaps %xmm13, 176(%rdx) -; SSE-NEXT: movaps %xmm3, 128(%rdx) -; SSE-NEXT: movaps %xmm15, 144(%rdx) -; SSE-NEXT: movaps %xmm5, 96(%rdx) -; SSE-NEXT: movaps %xmm14, 112(%rdx) -; SSE-NEXT: movaps %xmm2, 64(%rdx) -; SSE-NEXT: movaps %xmm10, 80(%rdx) -; SSE-NEXT: movaps %xmm1, 32(%rdx) -; SSE-NEXT: movaps %xmm9, 48(%rdx) -; SSE-NEXT: movaps %xmm8, (%rdx) +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1] +; SSE-NEXT: movaps %xmm0, 240(%rdx) +; SSE-NEXT: movaps %xmm7, 224(%rdx) +; SSE-NEXT: movaps %xmm6, 208(%rdx) +; SSE-NEXT: movaps %xmm12, 192(%rdx) +; SSE-NEXT: movaps %xmm4, 176(%rdx) +; SSE-NEXT: movaps %xmm13, 160(%rdx) +; SSE-NEXT: movaps %xmm3, 144(%rdx) +; SSE-NEXT: movaps %xmm15, 128(%rdx) +; SSE-NEXT: movaps %xmm5, 112(%rdx) +; SSE-NEXT: movaps %xmm14, 96(%rdx) +; SSE-NEXT: movaps %xmm2, 80(%rdx) +; SSE-NEXT: movaps %xmm10, 64(%rdx) +; SSE-NEXT: movaps %xmm1, 48(%rdx) +; SSE-NEXT: movaps %xmm9, 32(%rdx) +; SSE-NEXT: movaps %xmm8, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rdx) +; SSE-NEXT: movaps %xmm0, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf16: @@ -294,18 +294,18 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3] @@ -318,14 +318,14 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 160(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 224(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 224(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 160(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -339,38 +339,38 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm4[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm0[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm7[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm3[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -380,17 +380,17 @@ ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -556,44 +556,44 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride2_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 @@ -621,23 +621,23 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 352(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 224(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 32(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 416(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 352(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 288(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%rdx) ; AVX1-ONLY-NEXT: vmovapd %ymm10, 160(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 288(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 480(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 96(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 32(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -657,76 +657,76 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm1[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm7[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm4[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm9[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm14[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm12[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm13[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm8[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm10[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm5[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,2,2,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[2,1,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm6[0,0,2,1] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm15[0,1,1,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,0,2,1] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[0,1,1,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[0,2,2,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm15[2,1,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm1, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm14, 288(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 320(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -740,10 +740,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm9 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 @@ -753,14 +753,14 @@ ; AVX512-NEXT: vpermt2q %zmm6, %zmm10, %zmm2 ; AVX512-NEXT: vpermi2q %zmm7, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2q %zmm7, %zmm10, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm8, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -1522,10 +1522,10 @@ ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm13 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm14 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm15 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [4,12,5,13,6,14,7,15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,1,9,2,10,3,11] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512-NEXT: vpermt2q %zmm12, %zmm16, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,8,1,9,2,10,3,11] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [4,12,5,13,6,14,7,15] ; AVX512-NEXT: vpermt2q %zmm12, %zmm18, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm12 @@ -1547,22 +1547,22 @@ ; AVX512-NEXT: vpermt2q %zmm9, %zmm18, %zmm1 ; AVX512-NEXT: vpermi2q %zmm8, %zmm0, %zmm16 ; AVX512-NEXT: vpermt2q %zmm8, %zmm18, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm16, 960(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, 768(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm10, 832(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm3, 512(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm14, 448(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm13, 320(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm12, 192(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm16, 896(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 832(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm10, 768(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm15, 512(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -23,10 +23,10 @@ ; SSE-NEXT: movapd (%rdx), %xmm2 ; SSE-NEXT: movapd %xmm0, %xmm3 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] -; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: movapd %xmm1, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) ; SSE-NEXT: movapd %xmm3, (%rcx) ; SSE-NEXT: retq ; @@ -84,26 +84,26 @@ define void @store_i64_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride3_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movaps %xmm0, (%rcx) -; SSE-NEXT: movaps %xmm4, 16(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm1, 48(%rcx) -; SSE-NEXT: movaps %xmm5, 64(%rcx) -; SSE-NEXT: movaps %xmm6, 80(%rcx) +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm3 +; SSE-NEXT: movapd (%rdx), %xmm4 +; SSE-NEXT: movapd 16(%rdx), %xmm5 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; SSE-NEXT: movapd %xmm3, 80(%rcx) +; SSE-NEXT: movapd %xmm1, 64(%rcx) +; SSE-NEXT: movapd %xmm4, 48(%rcx) +; SSE-NEXT: movapd %xmm2, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm6, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf4: @@ -123,8 +123,8 @@ ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 64(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq @@ -139,15 +139,15 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -179,46 +179,46 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride3_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm3 -; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm1 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm7 -; SSE-NEXT: movaps 16(%rsi), %xmm8 -; SSE-NEXT: movaps 32(%rsi), %xmm9 -; SSE-NEXT: movaps 48(%rsi), %xmm10 -; SSE-NEXT: movaps (%rdx), %xmm11 -; SSE-NEXT: movaps 16(%rdx), %xmm12 -; SSE-NEXT: movaps 32(%rdx), %xmm6 -; SSE-NEXT: movaps 48(%rdx), %xmm5 -; SSE-NEXT: movaps %xmm10, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0] -; SSE-NEXT: movaps %xmm9, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm6[1] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; SSE-NEXT: movaps %xmm8, %xmm9 -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm2[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm8[0] -; SSE-NEXT: movaps %xmm7, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm3[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movaps %xmm3, (%rcx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm8, 32(%rcx) -; SSE-NEXT: movaps %xmm2, 48(%rcx) -; SSE-NEXT: movaps %xmm12, 64(%rcx) -; SSE-NEXT: movaps %xmm9, 80(%rcx) -; SSE-NEXT: movaps %xmm1, 96(%rcx) -; SSE-NEXT: movaps %xmm6, 112(%rcx) -; SSE-NEXT: movaps %xmm10, 128(%rcx) -; SSE-NEXT: movaps %xmm0, 144(%rcx) -; SSE-NEXT: movaps %xmm5, 160(%rcx) -; SSE-NEXT: movaps %xmm4, 176(%rcx) +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm1 +; SSE-NEXT: movapd 32(%rdi), %xmm3 +; SSE-NEXT: movapd 48(%rdi), %xmm6 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm4 +; SSE-NEXT: movapd 32(%rsi), %xmm7 +; SSE-NEXT: movapd 48(%rsi), %xmm8 +; SSE-NEXT: movapd (%rdx), %xmm9 +; SSE-NEXT: movapd 16(%rdx), %xmm10 +; SSE-NEXT: movapd 32(%rdx), %xmm11 +; SSE-NEXT: movapd 48(%rdx), %xmm12 +; SSE-NEXT: movapd %xmm0, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm9[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] +; SSE-NEXT: movapd %xmm3, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm7[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm11[0],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1] +; SSE-NEXT: movapd %xmm6, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm12[0],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm12[1] +; SSE-NEXT: movapd %xmm8, 176(%rcx) +; SSE-NEXT: movapd %xmm6, 160(%rcx) +; SSE-NEXT: movapd %xmm11, 144(%rcx) +; SSE-NEXT: movapd %xmm7, 128(%rcx) +; SSE-NEXT: movapd %xmm3, 112(%rcx) +; SSE-NEXT: movapd %xmm10, 96(%rcx) +; SSE-NEXT: movapd %xmm4, 80(%rcx) +; SSE-NEXT: movapd %xmm1, 64(%rcx) +; SSE-NEXT: movapd %xmm9, 48(%rcx) +; SSE-NEXT: movapd %xmm2, 32(%rcx) +; SSE-NEXT: movapd %xmm0, 16(%rcx) +; SSE-NEXT: movapd %xmm5, (%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf8: @@ -239,23 +239,23 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm2[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm7, 160(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 64(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rcx) @@ -270,36 +270,36 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm4[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm4[2,1,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm5[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -444,78 +444,78 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride3_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm6 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm1 ; AVX1-ONLY-NEXT: vmovapd (%rdx), %ymm9 -; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 32(%rdx), %ymm8 +; AVX1-ONLY-NEXT: vmovapd 64(%rdx), %ymm5 ; AVX1-ONLY-NEXT: vmovapd 96(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm10[0],mem[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm5[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3],ymm14[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm7[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm9[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm4[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm1[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm3[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2],ymm14[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0,0,3,2] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2],ymm6[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[1,0,2,2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2],ymm5[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1,0,2,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],mem[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,0,2,2] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm0, 64(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 352(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm1, 320(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 224(%rcx) ; AVX1-ONLY-NEXT: vmovapd %ymm13, 160(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 128(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 224(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 256(%rcx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm12, 64(%rcx) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 32(%rcx) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rcx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX1-ONLY-NEXT: vzeroupper @@ -523,78 +523,78 @@ ; ; AVX2-ONLY-LABEL: store_i64_stride3_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm11[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm8[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm12[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm11[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1],ymm7[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm11[1],ymm12[1],ymm11[3],ymm12[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm12 = ymm13[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 320(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 256(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 32(%rcx) +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm6[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm9[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm1[0,1,2,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm4[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm4[2,1,2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps %ymm1, 352(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 320(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 192(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 128(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 96(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -611,27 +611,26 @@ ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,14,6,u,15,7,u> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15] -; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7] -; AVX512-NEXT: vpermt2q %zmm5, %zmm14, %zmm13 -; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 -; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 -; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm9 -; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm9 -; AVX512-NEXT: vpermt2q %zmm2, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512-NEXT: vpermt2q %zmm2, %zmm9, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <5,u,14,6,u,15,7,u> +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpermt2q %zmm4, %zmm0, %zmm2 +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512-NEXT: vpermt2q %zmm5, %zmm8, %zmm6 +; AVX512-NEXT: vpermi2q %zmm3, %zmm1, %zmm9 +; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm9 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1235,12 +1234,12 @@ ; ; AVX512-LABEL: store_i64_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm0 -; AVX512-NEXT: vmovdqa64 (%rsi), %zmm4 -; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rsi), %zmm6 ; AVX512-NEXT: vmovdqa64 192(%rsi), %zmm7 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm8 @@ -1248,53 +1247,52 @@ ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,8,u,1,9,u,2,10> -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512-NEXT: vpermt2q %zmm0, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7] ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,14,6,u,15,7,u> -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15] -; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7] -; AVX512-NEXT: vpermt2q %zmm11, %zmm20, %zmm19 -; AVX512-NEXT: vpermt2q %zmm7, %zmm12, %zmm0 -; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7 -; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm7 -; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm18, %zmm11 -; AVX512-NEXT: vpermt2q %zmm10, %zmm20, %zmm11 -; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm1 -; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 -; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm6 -; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm10 -; AVX512-NEXT: vpermt2q %zmm9, %zmm20, %zmm10 -; AVX512-NEXT: vpermt2q %zmm5, %zmm12, %zmm2 -; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm2 -; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm15 -; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm15 -; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm3 -; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm3 -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm6, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm7, 512(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, 576(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm19, 640(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm16, 704(%rcx) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [10,1,2,11,4,5,12,7] +; AVX512-NEXT: vpermt2q %zmm8, %zmm17, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,14,6,u,15,7,u> +; AVX512-NEXT: vpermt2q %zmm2, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,14,5,6,15] +; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8 +; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm8 +; AVX512-NEXT: vpermt2q %zmm9, %zmm14, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm19 +; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm19 +; AVX512-NEXT: vpermt2q %zmm9, %zmm17, %zmm19 +; AVX512-NEXT: vpermt2q %zmm3, %zmm18, %zmm1 +; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm4, %zmm9 +; AVX512-NEXT: vpermt2q %zmm6, %zmm15, %zmm9 +; AVX512-NEXT: vpermt2q %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm6 +; AVX512-NEXT: vpermt2q %zmm10, %zmm2, %zmm6 +; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm12 +; AVX512-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512-NEXT: vpermi2q %zmm7, %zmm5, %zmm15 +; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512-NEXT: vpermt2q %zmm5, %zmm18, %zmm7 +; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm7, 704(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm15, 640(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm12, 576(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm6, 512(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 448(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm19, 256(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm13, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll @@ -26,10 +26,10 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, %xmm5 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-NEXT: movaps %xmm2, 48(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) ; SSE-NEXT: movaps %xmm5, 16(%r8) ; SSE-NEXT: movaps %xmm4, (%r8) ; SSE-NEXT: retq @@ -97,26 +97,26 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm5 ; SSE-NEXT: movaps (%rcx), %xmm6 ; SSE-NEXT: movaps 16(%rcx), %xmm7 -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0] -; SSE-NEXT: movaps %xmm0, %xmm9 -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm0, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm2[0] +; SSE-NEXT: movaps %xmm4, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; SSE-NEXT: movaps %xmm5, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0] -; SSE-NEXT: movaps %xmm1, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; SSE-NEXT: movaps %xmm5, 112(%r8) -; SSE-NEXT: movaps %xmm6, 64(%r8) -; SSE-NEXT: movaps %xmm2, 80(%r8) -; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm1, 96(%r8) +; SSE-NEXT: movaps %xmm6, 80(%r8) +; SSE-NEXT: movaps %xmm2, 64(%r8) ; SSE-NEXT: movaps %xmm4, 48(%r8) -; SSE-NEXT: movaps %xmm9, (%r8) -; SSE-NEXT: movaps %xmm8, 16(%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movaps %xmm9, 16(%r8) +; SSE-NEXT: movaps %xmm8, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride4_vf4: @@ -131,8 +131,8 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm4, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -194,129 +194,129 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm5 +; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm1 ; SSE-NEXT: movaps 32(%rdi), %xmm3 -; SSE-NEXT: movaps 48(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm10 +; SSE-NEXT: movaps 48(%rdi), %xmm6 +; SSE-NEXT: movaps (%rsi), %xmm8 ; SSE-NEXT: movaps 16(%rsi), %xmm12 -; SSE-NEXT: movaps 32(%rsi), %xmm11 +; SSE-NEXT: movaps 32(%rsi), %xmm13 +; SSE-NEXT: movaps 48(%rsi), %xmm9 ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps 16(%rdx), %xmm4 +; SSE-NEXT: movaps 16(%rdx), %xmm5 ; SSE-NEXT: movaps 32(%rdx), %xmm7 -; SSE-NEXT: movaps 48(%rdx), %xmm9 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%rcx), %xmm13 -; SSE-NEXT: movaps 32(%rcx), %xmm14 -; SSE-NEXT: movaps 48(%rcx), %xmm15 -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] -; SSE-NEXT: movaps %xmm5, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps 48(%rdx), %xmm0 +; SSE-NEXT: movaps (%rcx), %xmm11 +; SSE-NEXT: movaps 16(%rcx), %xmm14 +; SSE-NEXT: movaps 32(%rcx), %xmm15 ; SSE-NEXT: movaps %xmm4, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1] -; SSE-NEXT: movaps %xmm1, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0] +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm8[1] +; SSE-NEXT: movaps %xmm2, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] +; SSE-NEXT: movaps %xmm1, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; SSE-NEXT: movaps %xmm7, %xmm12 +; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm14[1] ; SSE-NEXT: movaps %xmm3, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1] -; SSE-NEXT: movaps %xmm9, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 48(%rsi), %xmm15 -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] -; SSE-NEXT: movaps %xmm0, 224(%r8) -; SSE-NEXT: movaps %xmm9, 240(%r8) -; SSE-NEXT: movaps %xmm6, 192(%r8) -; SSE-NEXT: movaps %xmm11, 208(%r8) -; SSE-NEXT: movaps %xmm3, 160(%r8) +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm13[1] +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm15[1] +; SSE-NEXT: movaps %xmm6, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm9[1] +; SSE-NEXT: movaps 48(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm0, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] +; SSE-NEXT: movaps %xmm0, 240(%r8) +; SSE-NEXT: movaps %xmm6, 224(%r8) +; SSE-NEXT: movaps %xmm10, 208(%r8) +; SSE-NEXT: movaps %xmm15, 192(%r8) ; SSE-NEXT: movaps %xmm7, 176(%r8) +; SSE-NEXT: movaps %xmm3, 160(%r8) +; SSE-NEXT: movaps %xmm13, 144(%r8) ; SSE-NEXT: movaps %xmm14, 128(%r8) -; SSE-NEXT: movaps %xmm12, 144(%r8) +; SSE-NEXT: movaps %xmm5, 112(%r8) ; SSE-NEXT: movaps %xmm1, 96(%r8) -; SSE-NEXT: movaps %xmm4, 112(%r8) -; SSE-NEXT: movaps %xmm13, 64(%r8) -; SSE-NEXT: movaps %xmm10, 80(%r8) -; SSE-NEXT: movaps %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm12, 80(%r8) +; SSE-NEXT: movaps %xmm11, 64(%r8) ; SSE-NEXT: movaps %xmm2, 48(%r8) -; SSE-NEXT: movaps %xmm8, (%r8) +; SSE-NEXT: movaps %xmm4, 32(%r8) +; SSE-NEXT: movaps %xmm8, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride4_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 144(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 48(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 192(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride4_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] @@ -329,34 +329,34 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm5[0] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm10[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm6[0],xmm3[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm6[0],xmm3[0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm9[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm11[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps %xmm6, 48(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 16(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 176(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 160(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 144(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm12[0],xmm10[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps %xmm7, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 176(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 144(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 48(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm13, (%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -645,56 +645,56 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm10[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm14[1],xmm12[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm12[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm14 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 432(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 416(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 400(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 384(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 176(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 400(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 304(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm15, 128(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 304(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 288(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 48(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -766,56 +766,56 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm11[0],xmm9[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm15[0],xmm14[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm14[1] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm15 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm13[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm15[0],xmm13[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm15[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm12[0],xmm10[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm12[1],xmm10[1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm14[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm13 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm13 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 432(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 416(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 400(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 384(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 176(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 160(%r8) +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm14[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 400(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 304(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 272(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 176(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm8, 144(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm15, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 304(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 288(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 272(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 48(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 32(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 16(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -846,32 +846,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,1,9,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <2,10,u,u,3,11,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,12,u,u,5,13,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,14,u,u,7,15,u,u> ; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -886,14 +886,14 @@ ; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -907,32 +907,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,1,9,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <2,10,u,u,3,11,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <4,12,u,u,5,13,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <6,14,u,u,7,15,u,u> ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8 @@ -947,14 +947,14 @@ ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -1397,38 +1397,38 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1445,21 +1445,21 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 @@ -1470,31 +1470,31 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 944(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 928(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 912(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm5, 816(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm7, 800(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm6, 784(%r8) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 768(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 944(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 928(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 896(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 688(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 672(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 656(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 688(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 672(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 656(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 640(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 528(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1504,29 +1504,29 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r8) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1662,38 +1662,38 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -1710,21 +1710,21 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 @@ -1735,31 +1735,31 @@ ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 944(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 928(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 912(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 896(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm5, 816(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm7, 800(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm6, 784(%r8) ; AVX2-ONLY-NEXT: vmovaps %xmm8, 768(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 944(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 928(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 912(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 896(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 688(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 672(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 656(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 688(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 672(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 656(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 640(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 560(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 544(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 528(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 640(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1769,29 +1769,29 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 288(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 272(%r8) ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 304(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 272(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1846,32 +1846,32 @@ ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512F-NEXT: movb $-52, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,10,u,u,3,11,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,12,u,u,5,13,u,u> ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <6,14,u,u,7,15,u,u> ; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -1922,22 +1922,22 @@ ; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512F-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1959,32 +1959,32 @@ ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-52, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,10,u,u,3,11,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <4,12,u,u,5,13,u,u> ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <6,14,u,u,7,15,u,u> ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2035,22 +2035,22 @@ ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm6 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm3 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 896(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 768(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 832(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 704(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 512(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 576(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 256(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 960(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 768(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 704(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 640(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 576(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 512(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 384(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 320(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -2941,26 +2941,26 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -2977,26 +2977,26 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3013,50 +3013,50 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3073,76 +3073,76 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1584(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1568(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 1552(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1536(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1840(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1824(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 1808(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 1792(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1968(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 1952(%r8) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1936(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1968(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1952(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1936(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1920(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1840(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1824(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1808(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 1792(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 1712(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 1696(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1680(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 1664(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1584(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1568(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1552(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1920(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1536(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1712(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1696(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1680(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1664(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1328(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3152,14 +3152,6 @@ ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1280(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1200(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1184(%r8) @@ -3168,13 +3160,13 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1152(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1024(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 944(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3184,6 +3176,14 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 896(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 688(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 672(%r8) @@ -3192,21 +3192,21 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 416(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3216,21 +3216,21 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1024(%r8) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%r8) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3494,26 +3494,26 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3530,26 +3530,26 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3566,50 +3566,50 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] @@ -3626,76 +3626,76 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm1[0],xmm0[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovaps %xmm0, 48(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm4, (%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 1584(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 1568(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 1552(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 1536(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 1840(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 1824(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 1808(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 1792(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 1968(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 1952(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 1936(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1968(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 1952(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 1936(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 1920(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 1840(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 1824(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 1808(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 1792(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 1712(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 1696(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 1680(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 1664(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 1584(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 1568(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 1552(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1920(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1536(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1712(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1696(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1680(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1664(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1328(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3705,14 +3705,6 @@ ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1280(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1456(%r8) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1440(%r8) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1424(%r8) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1408(%r8) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1200(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1184(%r8) @@ -3721,13 +3713,13 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1152(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1024(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 944(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3737,6 +3729,14 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 896(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 816(%r8) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 800(%r8) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 784(%r8) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 768(%r8) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 688(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 672(%r8) @@ -3745,21 +3745,21 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 640(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 432(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 560(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 544(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 400(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 528(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 512(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 432(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 416(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 144(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 400(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 384(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 304(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3769,21 +3769,21 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 256(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 560(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 544(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 528(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 144(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 512(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 128(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1072(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 48(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1056(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1040(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%r8) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1024(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2016(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -29,9 +29,9 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: movapd %xmm2, 16(%r9) -; SSE-NEXT: movapd %xmm0, 32(%r9) ; SSE-NEXT: movapd %xmm1, 48(%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: movapd %xmm2, 16(%r9) ; SSE-NEXT: movapd %xmm3, 64(%r9) ; SSE-NEXT: movapd %xmm5, (%r9) ; SSE-NEXT: retq @@ -109,70 +109,70 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i64_stride5_vf4: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm1 -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: movaps (%rsi), %xmm2 -; SSE-NEXT: movaps 16(%rsi), %xmm3 -; SSE-NEXT: movaps (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm6 -; SSE-NEXT: movaps 16(%rcx), %xmm7 -; SSE-NEXT: movaps (%r8), %xmm8 -; SSE-NEXT: movaps 16(%r8), %xmm9 -; SSE-NEXT: movaps %xmm3, %xmm10 -; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0] +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm3 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm4 +; SSE-NEXT: movapd (%rdx), %xmm1 +; SSE-NEXT: movapd 16(%rdx), %xmm5 +; SSE-NEXT: movapd (%rcx), %xmm6 +; SSE-NEXT: movapd 16(%rcx), %xmm7 +; SSE-NEXT: movapd (%r8), %xmm8 +; SSE-NEXT: movapd 16(%r8), %xmm9 +; SSE-NEXT: movapd %xmm0, %xmm10 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm8[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, (%r9) -; SSE-NEXT: movaps %xmm4, 16(%r9) -; SSE-NEXT: movaps %xmm8, 32(%r9) -; SSE-NEXT: movaps %xmm3, 48(%r9) -; SSE-NEXT: movaps %xmm6, 64(%r9) -; SSE-NEXT: movaps %xmm0, 80(%r9) -; SSE-NEXT: movaps %xmm5, 96(%r9) -; SSE-NEXT: movaps %xmm9, 112(%r9) -; SSE-NEXT: movaps %xmm10, 128(%r9) -; SSE-NEXT: movaps %xmm7, 144(%r9) +; SSE-NEXT: movapd %xmm3, %xmm8 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0] +; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm9[0],xmm3[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm5 = xmm5[0],xmm7[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] +; SSE-NEXT: movapd %xmm7, 144(%r9) +; SSE-NEXT: movapd %xmm4, 128(%r9) +; SSE-NEXT: movapd %xmm3, 112(%r9) +; SSE-NEXT: movapd %xmm5, 96(%r9) +; SSE-NEXT: movapd %xmm8, 80(%r9) +; SSE-NEXT: movapd %xmm6, 64(%r9) +; SSE-NEXT: movapd %xmm2, 48(%r9) +; SSE-NEXT: movapd %xmm0, 32(%r9) +; SSE-NEXT: movapd %xmm1, 16(%r9) +; SSE-NEXT: movapd %xmm10, (%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1] ; AVX1-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 96(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 96(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 128(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 64(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 128(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -182,33 +182,33 @@ ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm5[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 128(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r9) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -221,6 +221,7 @@ ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <15,3,7,u> ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,4,8,12,u,1,5,9> ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7] @@ -231,8 +232,7 @@ ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512-NEXT: vmovdqa %ymm3, 128(%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64 @@ -331,129 +331,129 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm5[1],ymm6[2,3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm5[2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm5[0],ymm9[1,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm2 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0],ymm2[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[2],ymm7[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm2[2],ymm7[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 64(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 192(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 256(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm11, 224(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm8, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 64(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 288(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%r9) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm9 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm9 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm10[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm4, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 192(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 64(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 128(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm12, 288(%r9) @@ -779,153 +779,150 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $216, %rsp -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: subq $152, %rsp +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm2[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm15[0],mem[0],ymm15[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm10[0,1,2],ymm12[3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm9[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm8[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm12[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm8[0],ymm12[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm7[0],mem[0],ymm7[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm0[0],ymm9[2],ymm0[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm14[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0],ymm14[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm8[0],ymm0[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm0[2],ymm3[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0],ymm11[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0],ymm0[1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm2[1,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm9[2],ymm0[3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm13, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm12, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm2, 576(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 544(%r9) ; AVX1-ONLY-NEXT: vmovapd %ymm3, 512(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 384(%r9) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 352(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 256(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm14, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 384(%r9) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 352(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 256(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm13, 224(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 608(%r9) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%r9) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%r9) @@ -933,156 +930,154 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $216, %rsp +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride5_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $312, %rsp # imm = 0x138 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: subq $280, %rsp # imm = 0x118 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rsi), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0,1],ymm15[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1],ymm15[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1],ymm9[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm8[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm11 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm4 = ymm1[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm1[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm1[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm4 = ymm0[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm0[2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm0[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm10, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm15, (%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 496(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 480(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 336(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 576(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 544(%r9) +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm11, 496(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 480(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 336(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 320(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 176(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm13, (%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 576(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 544(%r9) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 512(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 416(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 384(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 352(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 416(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 384(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 352(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 256(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1097,207 +1092,203 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r9) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 608(%r9) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 608(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-ONLY-NEXT: addq $312, %rsp # imm = 0x138 +; AVX2-ONLY-NEXT: addq $280, %rsp # imm = 0x118 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride5_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm10 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512F-NEXT: movb $8, %al ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm9, %zmm8 ; AVX512F-NEXT: movb $-116, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm8 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm14 +; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm14 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512F-NEXT: movb $24, %al +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm12, %zmm24, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm14, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm16 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm17, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm7, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm2, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 64(%r9) ; AVX512F-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm10 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm3 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1} ; AVX512BW-NEXT: movb $8, %al ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm16, %zmm14 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm8 ; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm17, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 -; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm8 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm18, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm14 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm23 +; AVX512BW-NEXT: movb $24, %al +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7] ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm25, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm15, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm21, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm26, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm10, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm16 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm17, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm15 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm1, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm7, %zmm12, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm10, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%r9) ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1711,309 +1702,299 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm11 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm7[1],ymm1[3],ymm7[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: subq $984, %rsp # imm = 0x3D8 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm13 +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm13 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm13[0],mem[0],ymm13[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm0[0,1,2],ymm3[3] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm13[0],ymm9[0],ymm13[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = ymm14[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm13[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = ymm9[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, (%rsp), %ymm9, %ymm13 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],mem[0],ymm2[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm13[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm13[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm13[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm13[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm13[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm12[0],ymm2[0],ymm12[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm13[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm13[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0],ymm14[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm14[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm14[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0],ymm8[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm13[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm13[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm12 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm12[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm12[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3],ymm12[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm9 = ymm13[0],mem[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0],ymm13[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 816(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 800(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 496(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 160(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 336(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 320(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 656(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 960(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 816(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 800(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 496(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 480(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 336(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 320(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 176(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 160(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 16(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, (%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2021,10 +2002,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9) @@ -2033,8 +2018,10 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%r9) @@ -2055,20 +2042,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) @@ -2078,7 +2059,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX1-ONLY-NEXT: addq $984, %rsp # imm = 0x3D8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2092,10 +2073,10 @@ ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2105,7 +2086,7 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] @@ -2115,7 +2096,7 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -2203,12 +2184,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vbroadcastsd 248(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rsi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] @@ -2216,48 +2197,48 @@ ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %ymm0 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rsi), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],ymm1[1],ymm10[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] ; AVX2-ONLY-NEXT: vbroadcastsd 152(%rsi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm6 ; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm2[1],ymm6[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm11[0,1],ymm13[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = ymm13[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm12[0,1],mem[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1],ymm12[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm13[4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm11[0,1],mem[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1],ymm11[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = ymm11[0,1],mem[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm11[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps $63, (%rsp), %ymm11, %ymm12 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] @@ -2290,7 +2271,7 @@ ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm1[0],ymm10[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 $49, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = ymm0[2,3],mem[2,3] ; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -2328,7 +2309,7 @@ ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm1[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2347,15 +2328,15 @@ ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] @@ -2363,38 +2344,38 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm9, (%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 1136(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 1120(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm0, 976(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 960(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 1136(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 1120(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 816(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 800(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 960(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 816(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 800(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 656(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 640(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm3, 496(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm14, 480(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 176(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 160(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 336(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 320(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 656(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 640(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 336(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 320(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 176(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 16(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm7, (%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2465,405 +2446,391 @@ ; ; AVX512F-LABEL: store_i64_stride5_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512F-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm27 +; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm22 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm16, %zmm18, %zmm3 +; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm17, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm25, %zmm23, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm25, %zmm29, %zmm24 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm25 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm17, %zmm26 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm23, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm27, %zmm29, %zmm2 +; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm13, %zmm27 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm17, %zmm30 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm21, %zmm29, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm10, %zmm0, %zmm13 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm0, %zmm17 +; AVX512F-NEXT: vpermi2q %zmm10, %zmm0, %zmm23 +; AVX512F-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm10 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vpermt2q %zmm12, %zmm29, %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm4 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm11, %zmm18, %zmm8 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm21, %zmm9 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm29, %zmm11 +; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm12, %zmm18, %zmm7 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm21, %zmm6 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512F-NEXT: movb $49, %al ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512F-NEXT: movb $-116, %al -; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} -; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm13 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-NEXT: movb $8, %al +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512F-NEXT: movb $24, %al ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512F-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512F-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm19 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,8,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm22 +; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,9,2,3,4,5,10,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [12,1,2,3,4,13,6,7] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm19 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,14,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm16 +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm28 {%k3} +; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 {%k3} +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm27 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm31 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm12 +; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm23 +; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} +; AVX512F-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512F-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm14, 1088(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm13, 960(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm12, 896(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm31, 832(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm27, 640(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm11, 576(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm28, 512(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm26, 384(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm16, 256(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512F-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512F-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: store_i64_stride5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $648, %rsp # imm = 0x288 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm1 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512BW-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm27 +; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm21 +; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm24, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm18, %zmm29 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,8,u,u,u,1,9,u> -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [15,7,15,7,15,7,15,7] -; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,u,7,15,u> -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [13,5,13,5,13,5,13,5] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <1,u,u,u,10,2,u,u> -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm10, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm22 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm17, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm23, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm25, %zmm29, %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm25 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm17, %zmm26 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm23, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm27, %zmm29, %zmm2 +; AVX512BW-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm27 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm26, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm17, %zmm30 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm23, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm19, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm17 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm28, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm13 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm17 +; AVX512BW-NEXT: vpermi2q %zmm10, %zmm0, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm22, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm13, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm4 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm18, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm21, %zmm9 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm29, %zmm11 +; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm14, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm22, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm12, %zmm18, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm21, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm1 +; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm1 ; AVX512BW-NEXT: movb $49, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} ; AVX512BW-NEXT: movb $-116, %al -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k3} -; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm21 {%k3} -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 {%k3} -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm25 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: movb $8, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2} +; AVX512BW-NEXT: movb $24, %al ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,14,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm25 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [12,1,2,3,4,13,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,3,8,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29 {%k3} -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm26 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm18 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 576(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 640(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 768(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 832(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 896(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 960(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 1024(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1152(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1216(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9) -; AVX512BW-NEXT: addq $648, %rsp # imm = 0x288 +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,8,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm22 +; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm16 {%k3} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,9,2,3,4,5,10,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [12,1,2,3,4,13,6,7] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,14,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm16 +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm26 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm28 {%k3} +; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm25 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm26 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm30 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 {%k3} +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k3} +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm27 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm30 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm31 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm12 +; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm3, %zmm13 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm17 {%k1} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm23 +; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k3} +; AVX512BW-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqa64 %zmm1, 1216(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm23, 1152(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 1088(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 960(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 896(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm31, 832(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm30, 704(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm27, 640(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 576(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm28, 512(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm26, 384(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm25, 320(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 256(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 192(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r9) +; AVX512BW-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -2897,8 +2864,8 @@ ; SSE-NEXT: movapd (%rcx), %xmm12 ; SSE-NEXT: movapd 16(%rcx), %xmm13 ; SSE-NEXT: movapd 32(%rcx), %xmm14 -; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd 32(%r8), %xmm0 +; SSE-NEXT: movapd 16(%r8), %xmm1 ; SSE-NEXT: movapd (%r8), %xmm2 ; SSE-NEXT: movapd %xmm3, %xmm15 ; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm6[0] @@ -3692,167 +3659,119 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride5_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $2280, %rsp # imm = 0x8E8 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: subq $2312, %rsp # imm = 0x908 +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm10 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm8 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %ymm0 ; AVX1-ONLY-NEXT: vmovaps 464(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm7 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -3865,13 +3784,23 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rsi), %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 @@ -3879,17 +3808,25 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vbroadcastsd 264(%rsi), %ymm0 @@ -3903,202 +3840,224 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 296(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 328(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 360(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 392(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm6[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rsi), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm6[0,1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm8[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd %ymm15, %ymm11 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm13[0],ymm6[0],ymm13[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm14 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = ymm14[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1],ymm14[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,2,3],ymm14[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm13[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 112(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm13[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm13[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = mem[0,1,2,3],ymm13[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm9[0],ymm7[0],ymm9[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm15[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm15[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm15[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm9[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm9[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[0],ymm9[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm9[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovapd 176(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 176(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r8), %ymm7 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = mem[0],ymm7[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm9 = ymm7[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = ymm7[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1],ymm7[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[0,1,2,3],ymm7[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm6[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm5[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovapd 240(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = ymm6[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm7 = mem[0],ymm6[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 240(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = ymm5[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm5[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[0,1,2,3],ymm5[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4114,92 +4073,82 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 304(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovapd 304(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0],ymm5[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = ymm5[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm4 +; AVX1-ONLY-NEXT: vmovaps 304(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[0,1],ymm4[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm4[4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovapd 368(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = ymm4[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm5 = mem[0],ymm4[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = ymm3[0],mem[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3] +; AVX1-ONLY-NEXT: vmovaps 368(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = ymm3[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm4 = mem[0,1,2,3],ymm3[4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 432(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm2 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm12[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd $11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] @@ -4208,166 +4157,158 @@ ; AVX1-ONLY-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm13[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0,1],ymm1[2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm1[2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[0],ymm2[1],mem[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0],ymm10[1,2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovapd 496(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm10[1,2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0],ymm1[1],ymm8[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm1[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm14, (%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 1936(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2416(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 2400(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 2240(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 2416(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 2400(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) ; AVX1-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 2080(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1616(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 1600(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 1776(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 2080(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1936(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 1776(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1760(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 1616(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1600(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 1456(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 1296(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 1280(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1456(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 1440(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1120(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 960(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 1136(%r9) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 1120(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 960(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 800(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 480(%r9) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 656(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 640(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1296(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1280(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2432(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2304(%r9) @@ -4380,6 +4321,8 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2112(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1952(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1856(%r9) @@ -4388,10 +4331,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1792(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1632(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1536(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1344(%r9) @@ -4404,10 +4351,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1152(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 896(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 832(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%r9) @@ -4416,6 +4367,8 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%r9) @@ -4434,8 +4387,6 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2528(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 2464(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2368(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2336(%r9) @@ -4446,20 +4397,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 2016(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1984(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1888(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1728(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1696(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1664(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1568(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%r9) @@ -4470,20 +4415,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1056(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 1024(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 928(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 864(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 768(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 736(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 544(%r9) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%r9) @@ -4493,7 +4432,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%r9) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r9) -; AVX1-ONLY-NEXT: addq $2280, %rsp # imm = 0x8E8 +; AVX1-ONLY-NEXT: addq $2312, %rsp # imm = 0x908 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4506,10 +4445,10 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4520,7 +4459,7 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 40(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] @@ -4530,7 +4469,7 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5056,13 +4995,13 @@ ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5074,13 +5013,13 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5092,98 +5031,98 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm9, (%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1936(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 1920(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 2256(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 2416(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 2400(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 2256(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm11, 2240(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 2416(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 2400(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 2096(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 2080(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 2096(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 2080(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 1936(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 1920(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 1776(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 1760(%r9) ; AVX2-ONLY-NEXT: vmovaps %xmm4, 1616(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 1600(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 1776(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 1760(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 1456(%r9) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 1440(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 1600(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 1456(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 1440(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 1296(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 1280(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 976(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 960(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1120(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1136(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 976(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1120(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 960(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 816(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 800(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 496(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 656(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 480(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 640(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 496(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 480(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 336(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 320(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 656(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 640(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1296(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1280(%r9) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r9) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -29,15 +29,15 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, %xmm7 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] -; SSE-NEXT: movaps %xmm4, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm5[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm2, 16(%rax) -; SSE-NEXT: movaps %xmm4, 32(%rax) +; SSE-NEXT: movaps %xmm4, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE-NEXT: movaps %xmm4, 80(%rax) ; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps %xmm8, 80(%rax) +; SSE-NEXT: movaps %xmm3, 32(%rax) +; SSE-NEXT: movaps %xmm2, 16(%rax) ; SSE-NEXT: movaps %xmm7, 64(%rax) ; SSE-NEXT: movaps %xmm6, (%rax) ; SSE-NEXT: retq @@ -126,94 +126,92 @@ ; SSE-LABEL: store_i64_stride6_vf4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps (%rdi), %xmm2 -; SSE-NEXT: movaps 16(%rdi), %xmm1 +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movaps 16(%rdi), %xmm2 ; SSE-NEXT: movaps (%rsi), %xmm5 ; SSE-NEXT: movaps 16(%rsi), %xmm6 -; SSE-NEXT: movaps (%rdx), %xmm0 -; SSE-NEXT: movaps 16(%rdx), %xmm4 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm8 -; SSE-NEXT: movaps (%r8), %xmm9 +; SSE-NEXT: movaps (%rdx), %xmm1 +; SSE-NEXT: movaps 16(%rdx), %xmm7 +; SSE-NEXT: movaps (%rcx), %xmm8 +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps (%r8), %xmm4 ; SSE-NEXT: movaps 16(%r8), %xmm10 ; SSE-NEXT: movaps (%r9), %xmm11 ; SSE-NEXT: movaps 16(%r9), %xmm12 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] -; SSE-NEXT: movaps %xmm10, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm12[0] -; SSE-NEXT: movaps %xmm9, %xmm14 -; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm11[1] -; SSE-NEXT: movaps %xmm0, %xmm15 -; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm7[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm5[0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movaps %xmm1, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movaps %xmm4, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] +; SSE-NEXT: movaps %xmm7, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm9[0] +; SSE-NEXT: movaps %xmm10, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm6[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; SSE-NEXT: movaps %xmm4, %xmm12 -; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm8[1] -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] -; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm11[0] -; SSE-NEXT: movaps %xmm9, 32(%rax) -; SSE-NEXT: movaps %xmm2, 48(%rax) -; SSE-NEXT: movaps %xmm1, 96(%rax) -; SSE-NEXT: movaps %xmm4, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) ; SSE-NEXT: movaps %xmm10, 176(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm15, 64(%rax) -; SSE-NEXT: movaps %xmm14, 80(%rax) -; SSE-NEXT: movaps %xmm13, 128(%rax) -; SSE-NEXT: movaps %xmm3, 144(%rax) +; SSE-NEXT: movaps %xmm7, 160(%rax) +; SSE-NEXT: movaps %xmm2, 144(%rax) +; SSE-NEXT: movaps %xmm11, 128(%rax) +; SSE-NEXT: movaps %xmm8, 112(%rax) +; SSE-NEXT: movaps %xmm5, 96(%rax) +; SSE-NEXT: movaps %xmm4, 80(%rax) +; SSE-NEXT: movaps %xmm1, 64(%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm14, 32(%rax) +; SSE-NEXT: movaps %xmm13, 16(%rax) +; SSE-NEXT: movaps %xmm3, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm9[1],xmm8[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm10, %ymm11 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm5, 32(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm2 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm1[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps %xmm3, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -226,39 +224,40 @@ ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm10 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm8[0],xmm7[0] ; AVX2-ONLY-NEXT: vmovaps %ymm11, 96(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm3, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 64(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -418,276 +417,278 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 +; AVX1-ONLY-NEXT: pushq %rax +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm14 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm8 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm2, %ymm8 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm12[1],xmm5[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3],ymm8[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm14[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm15[1],ymm12[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm8[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[2],ymm15[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm13 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm13[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm15[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm11[0],ymm13[0],ymm11[2],ymm13[3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm14[0],xmm4[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm13[0],ymm15[0],ymm13[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm11 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm12[0],xmm5[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm13, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm12, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm11, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm15, 320(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: pushq %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 +; AVX2-ONLY-NEXT: subq $24, %rsp +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm13 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm9[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = xmm4[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm2[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[0,1],ymm5[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm11[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = xmm13[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm14[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm9[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm8[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm15 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm10[1],xmm7[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = xmm13[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm12[0,1],ymm14[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm0 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm12[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm7[1],mem[1],ymm7[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm14 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm14[2,3] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm15[0],xmm4[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm6[0] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm15[0],ymm0[0],ymm15[2],ymm0[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm4[0],ymm12[0],ymm4[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm11[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm10[0],xmm7[0] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm9, (%rax) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovaps %xmm7, 208(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 160(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 352(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm13, 224(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 288(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm14, 96(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: popq %rax +; AVX2-ONLY-NEXT: addq $24, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: store_i64_stride6_vf8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512F-NEXT: movb $12, %r10b ; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512F-NEXT: movb $16, %r10b ; AVX512F-NEXT: kmovw %r10d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm7, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm11, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermi2q %zmm6, %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-NEXT: vzeroupper @@ -696,85 +697,85 @@ ; AVX512BW-LABEL: store_i64_stride6_vf8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] -; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} ; AVX512BW-NEXT: movb $16, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,9,2,10,1,9,2,10] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,9,u,4,5,6,7> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm8 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm8 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,13,6,14,5,13,6,14] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,13,u,4,5,6,7> ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm9 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,1,9,0,8,1,9] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],mem[0] ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm7 {%k1} ; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm7, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm7, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermi2q %zmm5, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm7, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm11, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [7,15,7,15] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1040,21 +1041,21 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm13 +; AVX1-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm15 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm2 @@ -1062,7 +1063,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 @@ -1072,115 +1073,110 @@ ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm14[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm9[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm2[1] ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm13[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm10 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm10[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[2],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm9[0],ymm13[0],ymm9[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm11[0],ymm1[2],ymm11[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm11[1],ymm1[3],ymm11[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm1[2,3],ymm11[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm11[6,7] -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm13[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] @@ -1190,53 +1186,56 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm12 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm12 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 592(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 384(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 704(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 512(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 592(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 576(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, (%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm3, 736(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 672(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 640(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 608(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 480(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 480(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm10, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 352(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm11, 288(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) @@ -1250,198 +1249,206 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX1-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm11 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[0,1],ymm3[0,1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm5[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm8[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm2 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[0,1],ymm2[0,1] -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm5[0,0] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm10[1],xmm0[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm9 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm9[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm6 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm12 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm12[0],ymm5[2],ymm12[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm9[0],ymm12[0],ymm9[2],ymm12[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm9[1],ymm12[1],ymm9[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm4[0],ymm9[0],ymm4[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm9[1],ymm5[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = mem[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm9[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm13[1],mem[1],ymm13[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3],ymm9[2,3] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm0[0],ymm13[2],ymm0[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm0[1],ymm13[3],ymm0[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm6 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm13 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm14 ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm13[1],ymm6[3],ymm13[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm15[1],mem[1],ymm15[3],mem[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm13[0],ymm14[0],ymm13[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm12[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm15[1],mem[1],ymm15[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm13 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm13 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm14 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm15 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 592(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 208(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 400(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 384(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 736(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 672(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 544(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps %xmm6, 592(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 576(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 400(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 208(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm13, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 736(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 704(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 672(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 544(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 512(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm9, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 352(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm15, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 160(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1462,1129 +1469,289 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $456, %rsp # imm = 0x1C8 +; AVX2-ONLY-NEXT: addq $504, %rsp # imm = 0x1F8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: movb $12, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $16, %r10b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: movb $48, %r9b -; AVX512F-ONLY-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vzeroupper -; AVX512F-ONLY-SLOW-NEXT: retq -; -; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf16: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: movb $12, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $16, %r10b -; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512F-ONLY-FAST-NEXT: movb $48, %r9b -; AVX512F-ONLY-FAST-NEXT: kmovw %r9d, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq +; AVX512F-LABEL: store_i64_stride6_vf16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%r8), %zmm10 +; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512F-NEXT: movb $12, %r10b +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512F-NEXT: movb $16, %r10b +; AVX512F-NEXT: kmovw %r10d, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm13, %zmm9, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} +; AVX512F-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm18, %zmm15 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,9,2,10,1,9,2,10] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512F-NEXT: movb $48, %r9b +; AVX512F-NEXT: kmovw %r9d, %k2 +; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,9,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,13,6,14,5,13,6,14] +; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm17, %zmm15 +; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm22, %zmm15 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,13,4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm12 +; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm19, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm17 +; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-NEXT: vpermt2q %zmm5, %zmm22, %zmm17 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm18 +; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm18 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm19 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 +; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm22[1],mem[1],ymm22[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm22, %zmm21, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm22, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,10,2,3,4,5,6,11] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm24, %zmm21 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,12,6,7] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm9 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm7 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,14,2,3,4,5,6,15] +; AVX512F-NEXT: vpermt2q %zmm16, %zmm9, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm20 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm5, %zmm22, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm10 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm25, %zmm3 +; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm4 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: movb $12, %r10b -; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: movb $16, %r10b -; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQ-SLOW-NEXT: movb $48, %r9b -; AVX512DQ-SLOW-NEXT: kmovw %r9d, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i64_stride6_vf16: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: movb $12, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: movb $16, %r10b -; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQ-FAST-NEXT: movb $48, %r9b -; AVX512DQ-FAST-NEXT: kmovw %r9d, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq -; -; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %r10b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $16, %r10b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r9b -; AVX512BW-ONLY-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq -; -; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf16: -; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $16, %r10b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: movb $48, %r9b -; AVX512BW-ONLY-FAST-NEXT: kmovd %r9d, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vzeroupper -; AVX512BW-ONLY-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf16: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: movb $12, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $16, %r10b -; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQBW-SLOW-NEXT: movb $48, %r9b -; AVX512DQBW-SLOW-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq -; -; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf16: -; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: movb $12, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: movb $16, %r10b -; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512DQBW-FAST-NEXT: movb $48, %r9b -; AVX512DQBW-FAST-NEXT: kmovd %r9d, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm22 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQBW-FAST-NEXT: vzeroupper -; AVX512DQBW-FAST-NEXT: retq +; AVX512BW-LABEL: store_i64_stride6_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm10 +; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm5 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512BW-NEXT: movb $12, %r10b +; AVX512BW-NEXT: kmovd %r10d, %k1 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 {%k1} +; AVX512BW-NEXT: movb $16, %r10b +; AVX512BW-NEXT: kmovd %r10d, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2} +; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm13, %zmm9, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm18, %zmm15 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [1,9,2,10,1,9,2,10] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm11 +; AVX512BW-NEXT: movb $48, %r9b +; AVX512BW-NEXT: kmovd %r9d, %k2 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm19, %zmm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,9,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm22 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm21, %zmm22 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,13,6,14,5,13,6,14] +; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm15 +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm15 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,13,4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm12 +; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm19, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm12 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm17 +; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm17 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm17 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm18 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm18 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm19 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm20, %zmm19 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm23 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm22 = ymm22[1],mem[1],ymm22[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm22, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm22, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,10,2,3,4,5,6,11] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm24, %zmm21 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,4,12,6,7] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm25, %zmm0 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm9 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm7 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,14,2,3,4,5,6,15] +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm9, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm20 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm23[1],mem[1],ymm23[3],mem[3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm20, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm22, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm10 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm3 +; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm4 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm8, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm14, %zmm9, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 @@ -3110,17 +2277,15 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1592, %rsp # imm = 0x638 +; AVX1-ONLY-NEXT: subq $1608, %rsp # imm = 0x648 ; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm9 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm14 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] @@ -3138,8 +2303,9 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm9[1] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] @@ -3154,10 +2320,10 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3177,8 +2343,8 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3192,9 +2358,11 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3] @@ -3274,68 +2442,52 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm7 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm12[1],ymm7[3],ymm12[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm7[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm12[1],ymm1[3],ymm12[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm1[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3],ymm12[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm5 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm12[0],ymm4[2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm4 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm12[1],ymm4[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm12[0],ymm6[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],ymm6[2,3] ; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[0],ymm5[2],ymm12[3] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm5 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm12 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm5[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm12[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm13[0],ymm12[0],ymm13[2],ymm12[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm12[0],ymm6[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm13[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm2 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm2[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm13 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm13[1],ymm12[3],ymm13[3] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm12 = mem[0,0,3,2] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm0 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm13[2,3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm12[0],ymm13[0],ymm12[2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm12 @@ -3372,18 +2524,18 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm7 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 @@ -3408,7 +2560,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm3 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] @@ -3418,23 +2570,23 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm14[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm0[0],mem[0] @@ -3443,7 +2595,8 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm12 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm0[0],xmm9[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm0[0],mem[0] @@ -3457,13 +2610,13 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload @@ -3478,22 +2631,22 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm2, 1168(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 1360(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 976(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 976(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 960(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm9, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 400(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 384(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 784(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 400(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 208(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3574,292 +2727,298 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $1592, %rsp # imm = 0x638 +; AVX1-ONLY-NEXT: addq $1608, %rsp # imm = 0x648 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1656, %rsp # imm = 0x678 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm2 +; AVX2-ONLY-NEXT: subq $1672, %rsp # imm = 0x688 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm14 ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm14[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm5[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[0,1],ymm3[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = xmm3[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] +; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm3 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm3[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = xmm3[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX2-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[0,1],ymm3[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm5 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = xmm5[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rcx), %xmm6 +; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[0,1],ymm5[0,1] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = xmm6[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm5 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = xmm6[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm15[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm5[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = xmm9[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm7 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = xmm7[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rcx), %xmm8 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm8[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm9 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm9[0],ymm1[2],ymm9[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm9 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm9[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm3[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm10[1],mem[1],ymm10[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm6 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm6 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm2[0],ymm6[0],ymm2[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm9[1],mem[1],ymm9[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 144(%r9), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm2 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm7[1],mem[1],ymm7[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],mem[1],ymm6[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3868,14 +3027,14 @@ ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm9 ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm14[1],ymm2[3],ymm14[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] @@ -3884,8 +3043,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm14[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3918,20 +3076,22 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm15[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm11[0],xmm8[0] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] @@ -3947,31 +3107,30 @@ ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 1344(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm2, 1168(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm3, 1152(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 1360(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 1344(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 976(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 960(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 976(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 960(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 784(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 768(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm9, 592(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm10, 576(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 208(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 400(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 400(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 384(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm13, 1504(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%rax) @@ -4048,7 +3207,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $1656, %rsp # imm = 0x678 +; AVX2-ONLY-NEXT: addq $1672, %rsp # imm = 0x688 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -7361,644 +6520,614 @@ ; AVX1-ONLY-LABEL: store_i64_stride6_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $3464, %rsp # imm = 0xD88 -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm6 -; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm7 +; AVX1-ONLY-NEXT: vmovapd (%r8), %ymm11 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r8), %ymm8 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm8 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r8), %ymm10 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 136(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%r8), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm11 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%r8), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm12 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 448(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r9), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r9), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm11[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vmovapd (%r9), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm5 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 64(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 64(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm10[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm6 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm14[0],ymm3[2],ymm14[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm8[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 64(%r9), %ymm14 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm14[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm12[0],ymm3[2],ymm12[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm12 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm12[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm11[0],ymm3[2],ymm11[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3],ymm3[2,3] +; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm3 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm3[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm11[0],ymm13[0],ymm11[2],ymm13[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm11 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm13 = mem[2,3],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm11 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm11[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm13[0],ymm10[0],ymm13[2],ymm10[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] +; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm13 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm10 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm10[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm8 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 96(%r9), %ymm10 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm10[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 128(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 128(%rsi), %ymm7 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm7[1],ymm0[3],ymm7[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 128(%r9), %ymm14 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm14[2,3],ymm7[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm7 = mem[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm0 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 160(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 160(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 160(%r9), %ymm7 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm7[2,3] +; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm7 +; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[2],ymm7[3] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm6 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3] +; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm8 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm9[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm15[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm15 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm5 = mem[0,0,3,2] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 192(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 192(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 192(%r9), %ymm15 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm15[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm5 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 224(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 256(%rdi), %ymm8 -; AVX1-ONLY-NEXT: vmovapd 256(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm8 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 256(%r9), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 288(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 288(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 288(%r9), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 320(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 320(%r9), %ymm2 -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 352(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 352(%r9), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 384(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 384(%r9), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm1[0],ymm9[2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm12[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %ymm12 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm9 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm9[1],ymm1[3],ymm9[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 448(%r9), %ymm13 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[0],ymm1[2],ymm9[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vmovapd 480(%r9), %ymm9 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm0[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 80(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 144(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 144(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm7[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 208(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm15[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 272(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 272(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 272(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm11[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 400(%rdx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -8009,7 +7138,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -8021,7 +7150,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 472(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 496(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] @@ -8033,7 +7162,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 504(%r8), %ymm1 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8048,7 +7177,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8056,7 +7185,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8066,7 +7195,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8082,7 +7211,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -8148,54 +7277,54 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 2320(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 2304(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, 2704(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 2896(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 2880(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 2512(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 2496(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 2896(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 2880(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm3, 2688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 2512(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 2496(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 2320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 2304(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 2128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2112(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm10, 1936(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 1920(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 2128(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 2112(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1744(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 1728(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 1744(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 1728(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 1552(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1168(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 1152(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 976(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 960(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 592(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 576(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1552(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 1536(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3008(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8362,23 +7491,24 @@ ; ; AVX2-ONLY-LABEL: store_i64_stride6_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3656, %rsp # imm = 0xE48 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: subq $3704, %rsp # imm = 0xE78 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm5 -; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -8386,8 +7516,8 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -8395,48 +7525,51 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[0,1],ymm2[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm3[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 40(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r8), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8451,14 +7584,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8473,13 +7607,14 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm15 ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8494,13 +7629,14 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm3 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8515,13 +7651,14 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[0,1],ymm0[0,1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] @@ -8536,413 +7673,421 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vmovaps 256(%r9), %xmm2 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rcx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 264(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vmovaps 288(%r9), %xmm4 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = xmm4[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rcx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rcx), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vbroadcastsd 296(%r8), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = xmm6[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rcx), %xmm7 ; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm7[1] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; AVX2-ONLY-NEXT: vbroadcastsd 328(%r8), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[0,1],ymm6[0,1] -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm7 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = xmm7[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %xmm1 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = xmm1[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rcx), %xmm8 ; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 360(%r8), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm8 -; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%r9), %xmm8 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = xmm8[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm8[0,1],ymm7[0,1] -; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = xmm9[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm7 -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm10 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = xmm8[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm10[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rcx), %xmm10 ; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[0,1],ymm9[0,1] -; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm10[1] +; AVX2-ONLY-NEXT: vbroadcastsd 392(%r8), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm10 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = xmm10[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm11[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm11 ; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm9[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm11[1] +; AVX2-ONLY-NEXT: vbroadcastsd 424(%r8), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm9 -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[0,1],ymm10[0,1] -; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm11 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%r9), %xmm11 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = xmm11[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 448(%rcx), %xmm12 ; AVX2-ONLY-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm10 ; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd 456(%r8), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %xmm11 +; AVX2-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%r9), %xmm12 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = xmm12[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rcx), %xmm13 +; AVX2-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%r8), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 80(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovaps 96(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] +; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 144(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] ; AVX2-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 144(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 144(%r9), %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%r8), %ymm12 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm15[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm11 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm12 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm12[1],ymm10[3],ymm12[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 176(%r9), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%r8), %ymm10 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm10 ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm10[0],ymm5[2],ymm10[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],ymm12[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 208(%rcx), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm12[1],mem[1],ymm12[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm10 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm3[2,3],ymm10[2,3] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm10[1],ymm5[3],ymm10[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 208(%r9), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm12[1],mem[1],ymm12[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r8), %ymm6 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm5[0],ymm6[0],ymm5[2],ymm6[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm10[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%rcx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm10[1],mem[1],ymm10[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 248(%r8), %ymm5 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],ymm10[2,3] +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 272(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],mem[1],ymm10[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 304(%r9), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 272(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%r8), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 304(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 304(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm4 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3],ymm5[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 336(%r9), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 368(%r9), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm3 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 400(%r9), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 312(%r8), %ymm2 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm2 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 336(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 336(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 368(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 376(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 400(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 400(%r9), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%r8), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm2[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 432(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 432(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],mem[1],ymm2[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],mem[1],ymm3[3],mem[3] ; AVX2-ONLY-NEXT: vbroadcastsd 440(%r8), %ymm1 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8955,7 +8100,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm8[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8972,7 +8117,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm0[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9012,22 +8157,22 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] @@ -9140,56 +8285,56 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 2320(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 2304(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 2704(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 2688(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 2896(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 2880(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 2512(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 2496(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 2896(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 2880(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 2704(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 2688(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 2512(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 2496(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 2320(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 2304(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 2128(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 2112(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm10, 1936(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm12, 1920(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 2128(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 2112(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 1744(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 1728(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1552(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1744(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1536(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1728(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1168(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 1152(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1360(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1344(%rax) -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 976(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 960(%rax) +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps %xmm0, 768(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 592(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 576(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 400(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 784(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 768(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1552(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vmovaps %xmm0, 1536(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm11, 3040(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 3008(%rax) @@ -9348,7 +8493,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-ONLY-NEXT: addq $3656, %rsp # imm = 0xE48 +; AVX2-ONLY-NEXT: addq $3704, %rsp # imm = 0xE78 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -31,15 +31,15 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1] -; SSE-NEXT: movapd %xmm2, 16(%rax) -; SSE-NEXT: movapd %xmm4, 32(%rax) -; SSE-NEXT: movapd %xmm0, 48(%rax) -; SSE-NEXT: movapd %xmm3, 80(%rax) ; SSE-NEXT: movapd %xmm5, 96(%rax) +; SSE-NEXT: movapd %xmm3, 80(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: movapd %xmm4, 32(%rax) +; SSE-NEXT: movapd %xmm2, 16(%rax) ; SSE-NEXT: movapd %xmm1, 64(%rax) ; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: retq @@ -51,22 +51,22 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -107,17 +107,17 @@ ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1 -; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <3,5,7,9,11,13,u,u> -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -146,101 +146,100 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: movaps 16(%rdi), %xmm7 -; SSE-NEXT: movapd (%rsi), %xmm0 -; SSE-NEXT: movaps 16(%rsi), %xmm13 -; SSE-NEXT: movapd (%rdx), %xmm4 -; SSE-NEXT: movaps 16(%rdx), %xmm2 -; SSE-NEXT: movapd (%rcx), %xmm3 -; SSE-NEXT: movaps 16(%rcx), %xmm1 -; SSE-NEXT: movapd (%r8), %xmm10 -; SSE-NEXT: movaps 16(%r8), %xmm6 +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: movapd 16(%rdi), %xmm5 +; SSE-NEXT: movapd (%rsi), %xmm2 +; SSE-NEXT: movapd 16(%rsi), %xmm8 +; SSE-NEXT: movapd (%rdx), %xmm1 +; SSE-NEXT: movapd 16(%rdx), %xmm6 +; SSE-NEXT: movapd (%rcx), %xmm4 +; SSE-NEXT: movapd 16(%rcx), %xmm11 +; SSE-NEXT: movapd (%r8), %xmm3 +; SSE-NEXT: movapd 16(%r8), %xmm10 ; SSE-NEXT: movapd (%r9), %xmm9 -; SSE-NEXT: movaps 16(%r9), %xmm5 +; SSE-NEXT: movapd 16(%r9), %xmm12 +; SSE-NEXT: movapd 16(%r10), %xmm13 ; SSE-NEXT: movapd (%r10), %xmm14 -; SSE-NEXT: movaps 16(%r10), %xmm12 -; SSE-NEXT: movaps %xmm13, %xmm11 -; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm12[1] -; SSE-NEXT: movapd %xmm8, %xmm15 -; SSE-NEXT: unpcklpd {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm7[2,3] -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm13[0] -; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm9[0] +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm2[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm9[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1] -; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm14[0],xmm8[1] -; SSE-NEXT: movapd %xmm10, 32(%rax) -; SSE-NEXT: movapd %xmm8, 48(%rax) +; SSE-NEXT: movapd %xmm5, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm8[0] +; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm13[0],xmm5[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm6[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm10[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; SSE-NEXT: movapd %xmm12, 208(%rax) +; SSE-NEXT: movapd %xmm11, 192(%rax) +; SSE-NEXT: movapd %xmm8, 176(%rax) +; SSE-NEXT: movapd %xmm5, 160(%rax) +; SSE-NEXT: movapd %xmm10, 144(%rax) +; SSE-NEXT: movapd %xmm6, 128(%rax) +; SSE-NEXT: movapd %xmm14, 112(%rax) ; SSE-NEXT: movapd %xmm9, 96(%rax) -; SSE-NEXT: movaps %xmm7, 112(%rax) -; SSE-NEXT: movaps %xmm12, 160(%rax) -; SSE-NEXT: movaps %xmm11, 176(%rax) -; SSE-NEXT: movapd %xmm15, (%rax) -; SSE-NEXT: movapd %xmm4, 16(%rax) -; SSE-NEXT: movapd %xmm0, 64(%rax) -; SSE-NEXT: movapd %xmm3, 80(%rax) -; SSE-NEXT: movaps %xmm2, 128(%rax) -; SSE-NEXT: movaps %xmm6, 144(%rax) -; SSE-NEXT: movaps %xmm1, 192(%rax) -; SSE-NEXT: movaps %xmm5, 208(%rax) +; SSE-NEXT: movapd %xmm4, 80(%rax) +; SSE-NEXT: movapd %xmm2, 64(%rax) +; SSE-NEXT: movapd %xmm0, 48(%rax) +; SSE-NEXT: movapd %xmm3, 32(%rax) +; SSE-NEXT: movapd %xmm1, 16(%rax) +; SSE-NEXT: movapd %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],mem[0],ymm3[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm10[0] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm8[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%r10), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm8[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm2[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -248,51 +247,51 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm4 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 16(%r10), %xmm0 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm2[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm11[2,3] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm8 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm1[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 16(%r10), %xmm10 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm6[0],mem[0],ymm6[2],mem[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm3[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm5 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm10[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm5, (%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm5[1],ymm6[1],ymm5[3],ymm6[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%r9), %ymm6 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm8[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm1, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -309,6 +308,7 @@ ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> ; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 @@ -328,16 +328,15 @@ ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,6,u,u,u,11,15,3> +; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm0 ; AVX512F-NEXT: movb $28, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512F-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -354,6 +353,7 @@ ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 @@ -373,16 +373,15 @@ ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,6,u,u,u,11,15,3> +; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm0 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512BW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <4 x i64>, ptr %in.vecptr0, align 64 @@ -530,215 +529,206 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovapd 32(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm10, %ymm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm11[1],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm11 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm9[1],ymm7[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm15[1],ymm3[3],ymm15[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm11[0],xmm12[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2],ymm14[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX1-ONLY-NEXT: vmovapd 32(%r8), %xmm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm14[0],xmm6[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rdi), %ymm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm14[0],ymm7[0],ymm14[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovapd 32(%rsi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[2],ymm14[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3] -; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm10[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[2],ymm8[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm10[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 48(%rdi), %xmm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 32(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3],ymm14[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0],ymm0[1],ymm14[2],ymm0[3] -; AVX1-ONLY-NEXT: vmovapd 32(%r8), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovapd 48(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[2],mem[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm9[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm14[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm9[0],ymm3[0],ymm9[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm9 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm10[2,3],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3] +; AVX1-ONLY-NEXT: vmovapd 48(%rcx), %xmm15 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm15 = xmm15[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm0 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm10[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 352(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 384(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm8, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm12, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 224(%rax) +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm4, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm13, 352(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 96(%rax) ; AVX1-ONLY-NEXT: vmovapd %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm12, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 256(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm9, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm2 -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm4 -; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm11 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm5 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm8 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm7[0],ymm3[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 16(%rax), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm13 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm15 -; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm9 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm12[0],mem[0],ymm12[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm14[2,3],ymm5[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm4[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm10[1],ymm11[1],ymm10[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm15 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],xmm13[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm7[0],mem[0],ymm7[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm13, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm11[0],mem[0],ymm11[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm14[2,3],ymm2[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm14[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],xmm13[1] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm15[0],ymm6[2],ymm15[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm13[0],mem[0],ymm13[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm15[1],ymm6[3],ymm15[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm2, 16(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm13, (%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 128(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 64(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm10, 320(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 384(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 256(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 96(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 224(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 288(%rdi) +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 48(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps %xmm9, 16(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, (%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 416(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 352(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm12, 320(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm10, 288(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 256(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 192(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 160(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdi) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -746,123 +736,123 @@ ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $6, %cl +; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl +; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm4[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $6, %cl +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: movb $28, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: movb $56, %cl -; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm3[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $-61, %cl +; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -871,245 +861,245 @@ ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: movb $12, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: movb $112, %sil +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $96, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: movb $12, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512F-ONLY-FAST-NEXT: movb $112, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: movb $48, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: movb $28, %sil -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[0,1,2,3],zmm4[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: movb $28, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm3[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: movb $48, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k2} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm8, %ymm12, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: movb $14, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $120, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $-61, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf8: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $-61, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: movb $112, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX512DQ-SLOW-NEXT: movb $28, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[0,1,2,3],zmm4[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: movb $24, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-SLOW-NEXT: movb $12, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQ-SLOW-NEXT: movb $112, %cl +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm7, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $6, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $56, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: movb $96, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: movb $28, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm12[0,1,2,3],zmm4[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: movb $120, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: movb $48, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: movb $14, %cl -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: movb $120, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $-61, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: movb $24, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: movb $48, %dl +; AVX512DQ-SLOW-NEXT: kmovw %edx, %k1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-SLOW-NEXT: movb $14, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -1117,122 +1107,123 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $-61, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: movb $48, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: movb $14, %sil +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: movb $12, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQ-FAST-NEXT: movb $96, %sil +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: movb $112, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512DQ-FAST-NEXT: movb $28, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[0,1,2,3],zmm4[2,3,0,1] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: movb $24, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $6, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm7, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $6, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: movb $56, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: movb $56, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: movb $96, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-FAST-NEXT: movb $28, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[0,1,2,3],zmm1[2,3,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FAST-NEXT: movb $12, %cl +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: movb $48, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQ-FAST-NEXT: movb $112, %cl +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: movb $14, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512DQ-FAST-NEXT: movb $120, %cl -; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm14[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $-61, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: movb $24, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -1240,123 +1231,123 @@ ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $6, %cl +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] ; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %ymm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm4[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $6, %cl +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: movb $28, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm10, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: movb $56, %cl -; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm13 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm3[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: movb $120, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %cl +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movb $48, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm12[1],ymm7[1],ymm12[3],ymm7[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm8[1],ymm12[3],ymm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -1365,245 +1356,245 @@ ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] ; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%r10), %zmm8, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k2} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm9, %ymm12, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: movb $28, %sil -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm9[0,1,2,3],zmm4[2,3,0,1] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm10, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm9, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm8[0],ymm12[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: movb $28, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm13[0,1,2,3],zmm3[2,3,0,1] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k2} = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm8, %ymm12, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm10, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $120, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf8: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $-61, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: movb $96, %sil +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: movb $112, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm11[0,1,2,3],zmm4[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: movb $24, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm12, %zmm11 -; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm8 {%k2} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQBW-SLOW-NEXT: movb $12, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm11 {%k2} -; AVX512DQBW-SLOW-NEXT: movb $112, %cl +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm7, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $6, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $56, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: movb $96, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%r10), %zmm13, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm12[0,1,2,3],zmm4[2,3,0,1] ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] ; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: movb $120, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $48, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: movb $14, %cl -; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm13 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm14 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: movb $120, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $-61, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: movb $24, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: movb $48, %dl +; AVX512DQBW-SLOW-NEXT: kmovd %edx, %k1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-SLOW-NEXT: movb $14, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -1611,122 +1602,123 @@ ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm4 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm8, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm6 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $-61, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: movb $48, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: movb $14, %sil +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQBW-FAST-NEXT: movb $12, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm9 -; AVX512DQBW-FAST-NEXT: movb $96, %sil +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: movb $112, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] -; AVX512DQBW-FAST-NEXT: movb $28, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[0,1,2,3],zmm4[2,3,0,1] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm11 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: movb $24, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = mem[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $6, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm7, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $6, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: movb $56, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $56, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: movb $96, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: movb $28, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm10[0,1,2,3],zmm1[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [3,0,12,4,3,0,12,4] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQBW-FAST-NEXT: movb $12, %cl +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $48, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} -; AVX512DQBW-FAST-NEXT: movb $112, %cl +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm4[0],zmm5[0],zmm4[2],zmm5[2],zmm4[4],zmm5[4],zmm4[6],zmm5[6] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: movb $14, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%r10), %zmm12, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm10 {%k2} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm12, %zmm13 ; AVX512DQBW-FAST-NEXT: movb $120, %cl -; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [5,0,14,6,5,0,14,6] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm8, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm11, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm14[0,1,2,3],zmm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $-61, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm11 {%k2} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: movb $24, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -2018,24 +2010,19 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $520, %rsp # imm = 0x208 +; AVX1-ONLY-NEXT: subq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm5 -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm0 -; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm12 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm13[0] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm7 -; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 @@ -2047,16 +2034,16 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm1[1],ymm6[3],ymm1[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm12[1],ymm1[1],ymm12[3],ymm1[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2064,178 +2051,165 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm11 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm11[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm7[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm4[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm2[0],mem[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm14, %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm11[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm11[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm13[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm12[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[2],ymm10[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm11[1],xmm12[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[2],ymm5[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm6[1] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm8 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm8[2,3],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3] -; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 112(%rdx), %xmm9 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm10 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm10[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 96(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 96(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 112(%rdi), %xmm4 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 96(%rax), %ymm7 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3] +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] +; AVX1-ONLY-NEXT: vmovapd 112(%rcx), %xmm12 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm12[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 120(%r9), %ymm15 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm12, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps %xmm15, 464(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 448(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 832(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm3, 768(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, (%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm2, 832(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 768(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 576(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 320(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 544(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 128(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 96(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm6, 864(%rax) -; AVX1-ONLY-NEXT: vmovapd %ymm9, 800(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm14, 736(%rax) +; AVX1-ONLY-NEXT: vmovapd %ymm7, 864(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 800(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 736(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 704(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 704(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 672(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 640(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 608(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 512(%rax) @@ -2259,7 +2233,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $520, %rsp # imm = 0x208 +; AVX1-ONLY-NEXT: addq $392, %rsp # imm = 0x188 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2343,20 +2317,20 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm12 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm15[0],ymm3[2],ymm15[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm5 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm12 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],mem[0],ymm9[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm5 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm3[0],ymm5[2],ymm3[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm15[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2] @@ -2400,12 +2374,12 @@ ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm6[0,1],ymm0[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm6 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm3[1],ymm5[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm3 @@ -2413,18 +2387,18 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm11[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd %xmm9, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] @@ -2440,45 +2414,45 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 112(%r9), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm9 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm11 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm11, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 464(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 464(%rax) ; AVX2-ONLY-NEXT: vmovaps %xmm8, 448(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm4, (%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 800(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 768(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm15, 576(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 544(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 512(%rax) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 352(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 320(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 128(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 96(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 832(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 736(%rax) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm5, 704(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 544(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 512(%rax) +; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 320(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 96(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 832(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 736(%rax) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm4, 704(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 672(%rax) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 640(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 608(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 416(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 480(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 416(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 384(%rax) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2497,1697 +2471,1707 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $200, %rsp +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm15, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm12, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: movb $24, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm30, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: movb $64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm29, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm25 ; AVX512F-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[0,1,2,3],zmm23[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm30[0,1,2,3],zmm22[2,3,0,1] -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: movb $48, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k4} = zmm13[0,1,2,3],zmm25[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm27, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm25, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm23[0],ymm11[0],ymm23[2],ymm11[2] +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm26[0,1,2,3],zmm25[2,3,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm25, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm22, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm22[0],zmm6[0],zmm22[2],zmm6[2],zmm22[4],zmm6[4],zmm22[6],zmm6[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm22 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm8 {%k3} ; AVX512F-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm10[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k6} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k3} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm21 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 {%k5} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm23[1],ymm11[1],ymm23[3],ymm11[3] +; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: pushq %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm19 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 -; AVX512F-ONLY-FAST-NEXT: movb $96, %sil +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm6 {%k2} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 -; AVX512F-ONLY-FAST-NEXT: movb $120, %sil -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 -; AVX512F-ONLY-FAST-NEXT: movb $24, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} -; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [9,1,9,1,9,1,9,1] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm20 ; AVX512F-ONLY-FAST-NEXT: movb $48, %sil -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm5[0],zmm10[0],zmm5[2],zmm10[2],zmm5[4],zmm10[4],zmm5[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm18, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm24[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: movb $14, %sil -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm5 +; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $28, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[0,1,2,3],zmm10[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[0,1,2,3],zmm13[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: movb $6, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k3} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm26 +; AVX512F-ONLY-FAST-NEXT: movb $56, %sil +; AVX512F-ONLY-FAST-NEXT: movb $-31, %dil +; AVX512F-ONLY-FAST-NEXT: kmovw %edi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm14 {%k4} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,3,11,3,11,3,11,3] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: movb $96, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k4} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: movb $28, %cl +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k4} = zmm25[0,1,2,3],zmm16[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm0[0],ymm25[0],ymm0[2],ymm25[2] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k4} = zmm26[0,1,2,3],zmm9[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm26, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm25, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: movb $14, %cl +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k2} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $64, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: movb $56, %al +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $120, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: pushq %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $64, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: movb $96, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQ-SLOW-NEXT: movb $24, %sil +; AVX512DQ-SLOW-NEXT: vmovaps 64(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: movb $12, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] -; AVX512DQ-SLOW-NEXT: movb $28, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[0,1,2,3],zmm23[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm28 +; AVX512DQ-SLOW-NEXT: movb $112, %sil +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] ; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[0,1,2,3],zmm22[2,3,0,1] -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm24, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] ; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQ-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] -; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 -; AVX512DQ-SLOW-NEXT: movb $48, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: movb $12, %sil +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $-61, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] +; AVX512DQ-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm22, %zmm23, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm31 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm16 +; AVX512DQ-SLOW-NEXT: movb $24, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: movb $-31, %dil +; AVX512DQ-SLOW-NEXT: kmovw %edi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k3} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQ-SLOW-NEXT: movb $112, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4 -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: movb $120, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm7 {%k3} +; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $6, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k6 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} -; AVX512DQ-SLOW-NEXT: movb $56, %sil -; AVX512DQ-SLOW-NEXT: kmovw %esi, %k7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: movb $-31, %sil +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm25[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm25 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm0, %zmm9 {%k2} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQ-SLOW-NEXT: movb $-61, %sil +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQ-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,9,0,3,4,9,0,3] +; AVX512DQ-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm29 +; AVX512DQ-SLOW-NEXT: movb $56, %sil +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm27 +; AVX512DQ-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm27, %zmm9 {%k3} +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k3} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQ-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512DQ-SLOW-NEXT: movb $96, %sil +; AVX512DQ-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k2} ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQ-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %ymm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k3} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512DQ-SLOW-NEXT: movb $28, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k3} = zmm8[0,1,2,3],zmm23[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm29[0],ymm8[2],ymm29[2] +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k3} = zmm14[0,1,2,3],zmm18[2,3,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm24, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: movb $120, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm20 {%k2} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: movb $48, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k2} = zmm4[0],zmm6[0],zmm4[2],zmm6[2],zmm4[4],zmm6[4],zmm4[6],zmm6[6] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k2} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm22 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm2 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512DQ-SLOW-NEXT: movb $64, %cl +; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-SLOW-NEXT: movb $14, %cl ; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm29[1],ymm8[3],ymm29[3] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm18, %zmm2, %zmm4 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQ-SLOW-NEXT: popq %rax ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: pushq %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQ-FAST-NEXT: movb $12, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 -; AVX512DQ-FAST-NEXT: movb $112, %sil -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: movb $96, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm19 +; AVX512DQ-FAST-NEXT: movb $48, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k2} = zmm5[0],zmm10[0],zmm5[2],zmm10[2],zmm5[4],zmm10[4],zmm5[6],zmm10[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 -; AVX512DQ-FAST-NEXT: movb $120, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm16, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm27, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm20, %zmm22 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm25[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $-61, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 {%k3} ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 -; AVX512DQ-FAST-NEXT: movb $24, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} -; AVX512DQ-FAST-NEXT: movb $-31, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $-61, %dil -; AVX512DQ-FAST-NEXT: kmovw %edi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 -; AVX512DQ-FAST-NEXT: movb $48, %sil -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} -; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQ-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: movb $14, %sil -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm25, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm17, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: movb $28, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[0,1,2,3],zmm10[2,3,0,1] -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[0,1,2,3],zmm13[2,3,0,1] -; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: movb $6, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm20 +; AVX512DQ-FAST-NEXT: movb $-31, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm31, %zmm23 +; AVX512DQ-FAST-NEXT: movb $112, %sil +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm23, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $6, %sil +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,9,0,3,4,9,0,3] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm20, %zmm26 +; AVX512DQ-FAST-NEXT: movb $56, %sil +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm31, %zmm7 {%k4} +; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm12 {%k4} +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,3,11,3,11,3,11,3] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm18 +; AVX512DQ-FAST-NEXT: movb $96, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %ymm26 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k4} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm0[0],ymm26[0],ymm0[2],ymm26[2] +; AVX512DQ-FAST-NEXT: movb $28, %cl +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm13[0,1,2,3],zmm17[2,3,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %ymm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm13[0],ymm20[0],ymm13[2],ymm20[2] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k4} = zmm23[0,1,2,3],zmm9[2,3,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermt2q %ymm26, %ymm23, %ymm0 +; AVX512DQ-FAST-NEXT: vpermt2q %ymm20, %ymm23, %ymm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: movb $14, %cl +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k2} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQ-FAST-NEXT: kmovw %ecx, %k2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm13, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} ; AVX512DQ-FAST-NEXT: movb $64, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: movb $56, %al +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: movb $120, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQ-FAST-NEXT: popq %rax ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $200, %rsp +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm21, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm15, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm21, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm30, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm12, %zmm10, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm10[0],zmm12[0],zmm10[2],zmm12[2],zmm10[4],zmm12[4],zmm10[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: movb $64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm29, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm24, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: movb $96, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm9, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm17, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm21, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm21, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: movb $28, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[0,1,2,3],zmm23[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm0, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k3} = zmm30[0,1,2,3],zmm22[2,3,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm22, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: movb $48, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm8[0],zmm7[0],zmm8[2],zmm7[2],zmm8[4],zmm7[4],zmm8[6],zmm7[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm30[0],ymm29[0],ymm30[2],ymm29[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k4} = zmm13[0,1,2,3],zmm25[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm27, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm25, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%r9), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm23[0],ymm11[0],ymm23[2],ymm11[2] +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm26[0,1,2,3],zmm25[2,3,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm10, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm25, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm31, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm22[0],zmm6[0],zmm22[2],zmm6[2],zmm22[4],zmm6[4],zmm22[6],zmm6[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm22 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, (%rax), %zmm3, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: movb $112, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm24, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm19 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $3, 64(%rax), %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm19 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm8 {%k3} ; AVX512BW-ONLY-SLOW-NEXT: movb $56, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: movb $120, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm20 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm10[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil -; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm28 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm28 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $-31, %sil +; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm21 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm30[1],ymm29[1],ymm30[3],ymm29[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $14, %cl ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm23[1],ymm11[1],ymm23[3],ymm11[3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 384(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $200, %rsp ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride7_vf16: ; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: pushq %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm18[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm14[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm18, %zmm0, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm14, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm19, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm19[0],mem[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm19, %zmm0, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 {%k2} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,5,13,5,13,5,13,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm22, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: movb $96, %sil +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm14, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm6 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,0,10,2,1,0,10,2] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm17, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] -; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm26, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k2} -; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm21[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $-61, %dil -; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 {%k3} -; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [9,1,9,1,9,1,9,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm20 ; AVX512BW-ONLY-FAST-NEXT: movb $48, %sil -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm19 {%k3} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k2} = zmm5[0],zmm10[0],zmm5[2],zmm10[2],zmm5[4],zmm10[4],zmm5[6],zmm10[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm15, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm18, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm24[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm24, %ymm5, %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm23 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,13,14,7,6,13,14,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm21 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm22, %ymm8, %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm23 {%k3} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm24[0],ymm28[2],ymm24[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm17[0,1,2,3],zmm16[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [11,3,11,3,11,3,11,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm24, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm17, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $28, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[0,1,2,3],zmm10[2,3,0,1] -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm8[0],ymm22[0],ymm8[2],ymm22[2] -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[0,1,2,3],zmm13[2,3,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: movb $6, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm5 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,9,0,3,4,9,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: movb $56, %sil +; AVX512BW-ONLY-FAST-NEXT: movb $-31, %dil +; AVX512BW-ONLY-FAST-NEXT: kmovd %edi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k4} +; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm14 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [11,3,11,3,11,3,11,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: movb $96, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm12, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm17 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm25 = ymm12[0],ymm18[0],ymm12[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: movb $28, %cl +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k4} = zmm25[0,1,2,3],zmm16[2,3,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm26 = ymm0[0],ymm25[0],ymm0[2],ymm25[2] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k4} = zmm26[0,1,2,3],zmm9[2,3,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm18, %ymm26, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm25, %ymm26, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $14, %cl +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k2} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm20 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,5,13,5,13,5,13,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $64, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: movb $56, %al +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $120, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm31, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: popq %rax ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: pushq %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm15[0,1,2,3],zmm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $64, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm19 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm17, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: movb $96, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm10, %zmm17 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm18, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm18, %zmm27 -; AVX512DQBW-SLOW-NEXT: movb $24, %sil +; AVX512DQBW-SLOW-NEXT: vmovaps 64(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm16, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: movb $12, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm14 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm29, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm21, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm21, %zmm15, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm19, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm20 = ymm19[0],ymm21[0],ymm19[2],ymm21[2] -; AVX512DQBW-SLOW-NEXT: movb $28, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k3} = zmm20[0,1,2,3],zmm23[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm22, %zmm20 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm27, %zmm28 +; AVX512DQBW-SLOW-NEXT: movb $112, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm12, %zmm0, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm18 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm30 = ymm29[0],ymm31[0],ymm29[2],ymm31[2] -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k3} = zmm30[0,1,2,3],zmm22[2,3,0,1] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm24, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7] ; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm30, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm30, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm13, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-SLOW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm8, %zmm12 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm28, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm28, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm6, %zmm28 -; AVX512DQBW-SLOW-NEXT: movb $48, %sil +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm23, %zmm26, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $-61, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm26, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-SLOW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm22, %zmm23, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm31 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm22, %zmm16 +; AVX512DQBW-SLOW-NEXT: movb $24, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $-31, %dil +; AVX512DQBW-SLOW-NEXT: kmovd %edi, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm16 {%k3} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k3} = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: movb $12, %sil +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm28, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $6, %sil +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm25[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm0, %ymm25 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm0, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm8, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm28, %zmm29 +; AVX512DQBW-SLOW-NEXT: movb $56, %sil +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm27 +; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm27, %zmm9 {%k3} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: movb $112, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4 -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, (%rax), %zmm24, %zmm1 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: movb $120, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 {%k5} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm11 {%k3} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-SLOW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 +; AVX512DQBW-SLOW-NEXT: movb $96, %sil ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 72(%rcx), %ymm2 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $6, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k6 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k6} -; AVX512DQBW-SLOW-NEXT: movb $56, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm8 {%k7} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm28 {%k3} -; AVX512DQBW-SLOW-NEXT: vinserti64x2 $3, 64(%rax), %zmm17, %zmm28 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: movb $-31, %sil -; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: movb $-61, %sil +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm13 {%k2} ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm11 {%k5} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq 8(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm23 {%k6} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 {%k7} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm19[1],ymm21[1],ymm19[3],ymm21[3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm10, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %ymm29 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm28, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm13 {%k3} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm28[0],ymm10[0],ymm28[2],ymm10[2] +; AVX512DQBW-SLOW-NEXT: movb $28, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k3} = zmm8[0,1,2,3],zmm23[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%r8), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm29[0],ymm8[2],ymm29[2] +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm27 {%k3} = zmm14[0,1,2,3],zmm18[2,3,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm17, %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm24, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: movb $120, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm30, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm24, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: movb $48, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k2} = zmm4[0],zmm6[0],zmm4[2],zmm6[2],zmm4[4],zmm6[4],zmm4[6],zmm6[6] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k2} = zmm3[0],zmm5[0],zmm3[2],zmm5[2],zmm3[4],zmm5[4],zmm3[6],zmm5[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm4, %zmm22 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm2 = zmm21[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm28[1],ymm10[1],ymm28[3],ymm10[3] +; AVX512DQBW-SLOW-NEXT: movb $64, %cl +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-SLOW-NEXT: movb $14, %cl ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm29[1],ymm31[1],ymm29[3],ymm31[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm8[1],ymm29[1],ymm8[3],ymm29[3] ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,3,3] ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm22, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,13,2,3,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm18, %zmm2, %zmm4 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 768(%rax) +; AVX512DQBW-SLOW-NEXT: popq %rax ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride7_vf16: ; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: pushq %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,8,0,1,0,8,0,1] -; AVX512DQBW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm24 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm17[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm7, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512DQBW-FAST-NEXT: movb $12, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm0, %zmm6 {%k2} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm17 -; AVX512DQBW-FAST-NEXT: movb $112, %sil -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm17, %zmm6 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] -; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm21, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm22 -; AVX512DQBW-FAST-NEXT: movb $96, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm30, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm17, %zmm23 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm0, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [1,0,10,2,1,0,10,2] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm19 +; AVX512DQBW-FAST-NEXT: movb $48, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k2} = zmm5[0],zmm10[0],zmm5[2],zmm10[2],zmm5[4],zmm10[4],zmm5[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm27, %zmm23 -; AVX512DQBW-FAST-NEXT: movb $120, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm24[0],mem[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm0, %zmm8 {%k2} -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm25, %zmm8 {%k3} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm16, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm27, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm11, %zmm20, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm25[0,1,2,3],zmm22[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $-61, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm22 {%k3} ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm25, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] -; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 -; AVX512DQBW-FAST-NEXT: movb $24, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm24 {%k2} -; AVX512DQBW-FAST-NEXT: movb $-31, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm24 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [5,0,14,6,5,0,14,6] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm12, %zmm15, %zmm28 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm28, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm1, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm20 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm20[0,1,2,3],zmm19[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $-61, %dil -; AVX512DQBW-FAST-NEXT: kmovd %edi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm19 {%k3} -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 {%k3} -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm20 -; AVX512DQBW-FAST-NEXT: movb $48, %sil -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm12, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k3} -; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm20 {%k3} = zmm2[0],zmm3[0],zmm2[2],zmm3[2],zmm2[4],zmm3[4],zmm2[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm28 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm27, %ymm5, %ymm7 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: movb $14, %sil -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm14, %zmm22 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm22 {%k3} = zmm11[0],zmm0[0],zmm11[2],zmm0[2],zmm11[4],zmm0[4],zmm11[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm25, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm17, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm25, %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm20 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQBW-FAST-NEXT: vpermi2q %ymm21, %ymm7, %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm22 {%k3} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm28[0],ymm27[0],ymm28[2],ymm27[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm18[0,1,2,3],zmm16[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [11,3,11,3,11,3,11,3] -; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [2,10,0,3,2,10,0,3] -; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm27, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: movb $28, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm5[0,1,2,3],zmm10[2,3,0,1] -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm7[0],ymm21[0],ymm7[2],ymm21[2] -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm5[0,1,2,3],zmm13[2,3,0,1] -; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: movb $6, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [9,1,9,1,9,1,9,1] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,9,0,3,4,9,0,3] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm20 +; AVX512DQBW-FAST-NEXT: movb $-31, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm31, %zmm23 +; AVX512DQBW-FAST-NEXT: movb $112, %sil +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, (%rax), %zmm23, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $6, %sil +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [9,1,9,1,9,1,9,1] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [4,9,0,3,4,9,0,3] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm20, %zmm26 +; AVX512DQBW-FAST-NEXT: movb $56, %sil +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm31, %zmm7 {%k4} +; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm12 {%k4} +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [11,3,11,3,11,3,11,3] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [2,10,0,3,2,10,0,3] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm18 +; AVX512DQBW-FAST-NEXT: movb $96, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %ymm26 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k4} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm0[0],ymm26[0],ymm0[2],ymm26[2] +; AVX512DQBW-FAST-NEXT: movb $28, %cl +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 {%k4} = zmm13[0,1,2,3],zmm17[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %ymm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r8), %ymm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm23 = ymm13[0],ymm20[0],ymm13[2],ymm20[2] +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k4} = zmm23[0,1,2,3],zmm9[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm26, %ymm23, %ymm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm20, %ymm23, %ymm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: movb $14, %cl +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm21 {%k2} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,5,13,5,13,5,13,5] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm13, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: movb $64, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: movb $56, %al +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: movb $120, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm25, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 {%k2} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm2, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm16, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [14,1,2,3,4,5,6,15] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,2,3,4,5,6,7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,1,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 832(%rax) +; AVX512DQBW-FAST-NEXT: popq %rax ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4767,124 +4751,120 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $1320, %rsp # imm = 0x528 +; AVX1-ONLY-NEXT: subq $1336, %rsp # imm = 0x538 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm5 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm5 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm3 +; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm6 ; AVX1-ONLY-NEXT: vmovaps (%r9), %ymm4 -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm9 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm11 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm8 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm9 -; AVX1-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[2],ymm2[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 16(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm1 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 @@ -4911,48 +4891,45 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 112(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm0 @@ -4960,23 +4937,22 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 @@ -5003,48 +4979,45 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rax), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm6[1] -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %ymm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 176(%rax), %xmm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm0 @@ -5052,30 +5025,29 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 @@ -5088,84 +5060,84 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm10 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%r8), %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1],ymm1[2,3],ymm4[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovapd 224(%rdi), %ymm1 +; AVX1-ONLY-NEXT: vmovapd 224(%rsi), %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm15[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 240(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0,0,3,2] ; AVX1-ONLY-NEXT: vmovapd 224(%rax), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2],ymm15[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovapd 240(%rdx), %xmm14 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],xmm15[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm14 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm14[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm14 = xmm15[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm15 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm13[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm10[0],xmm11[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovapd 240(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm14 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0],ymm0[1],ymm15[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1,2],ymm0[3] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm12[0],xmm11[0] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm10, 16(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm11, 1360(%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm8, 1344(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 448(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 912(%rsi) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 896(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm0, 1760(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm1, 1728(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm2, 1696(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 1664(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 912(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 896(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 464(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 448(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 16(%rsi) +; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm1, 1760(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 1728(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 1696(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm14, 1664(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 1632(%rsi) ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1600(%rsi) -; AVX1-ONLY-NEXT: vmovapd %ymm7, 1568(%rsi) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 1536(%rsi) +; AVX1-ONLY-NEXT: vmovapd %ymm10, 1568(%rsi) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 1536(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1504(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1472(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1440(%rsi) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1408(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 1376(%rsi) @@ -5247,19 +5219,19 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rsi) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rsi) -; AVX1-ONLY-NEXT: addq $1320, %rsp # imm = 0x528 +; AVX1-ONLY-NEXT: addq $1336, %rsp # imm = 0x538 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: subq $1432, %rsp # imm = 0x598 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm14 ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] @@ -5281,7 +5253,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm10[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm9[0],ymm13[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm14[0],ymm9[0],ymm14[2],ymm9[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm8[2,3],ymm6[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5405,22 +5377,21 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rax), %xmm13 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 128(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-ONLY-NEXT: vmovaps 128(%r8), %ymm11 +; AVX2-ONLY-NEXT: vmovaps 128(%r9), %ymm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],ymm10[0],ymm11[2],ymm10[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5434,13 +5405,13 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %xmm8 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm9[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm8[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -5454,9 +5425,9 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm8 -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm7 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %ymm7 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %ymm6 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5480,13 +5451,13 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm5[1] ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%r8), %ymm1 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] @@ -5494,22 +5465,15 @@ ; AVX2-ONLY-NEXT: vmovaps 208(%rax), %xmm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[0,1],ymm0[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -5525,12 +5489,13 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = ymm14[1],mem[1],ymm14[3],mem[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm15 @@ -5552,113 +5517,121 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm15 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm14 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm15 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm11[1],ymm10[1],ymm11[3],ymm10[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm10[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm7[1],ymm6[1],ymm7[3],ymm6[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%r9), %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm5[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm12 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm11[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 240(%r9), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%r9), %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 224(%rax), %ymm13 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm13[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm14 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm11, 16(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 1360(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 1344(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 464(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm0, 448(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 912(%rdi) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 896(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 1760(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 1360(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 1344(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 912(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 896(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 464(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 448(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 16(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm13, (%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 1760(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 1728(%rdi) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 1696(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm12, 1664(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rdi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 1600(%rdi) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 1568(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 1536(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 1536(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5668,7 +5641,7 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdi) ; AVX2-ONLY-NEXT: vmovaps %ymm4, 1376(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm7, 1312(%rdi) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 1312(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5679,7 +5652,8 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdi) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 1120(%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 1088(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -5744,7 +5718,7 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdi) -; AVX2-ONLY-NEXT: addq $1464, %rsp # imm = 0x5B8 +; AVX2-ONLY-NEXT: addq $1432, %rsp # imm = 0x598 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -6198,16 +6172,16 @@ ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 @@ -6215,205 +6189,208 @@ ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512F-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[0,1,2,3],zmm4[2,3,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] ; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,7,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm1, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm21[0],ymm25[0],ymm21[2],ymm25[2] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %ymm25, %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm14[2,3,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm2[0,1,2,3],zmm14[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm3[0,1,2,3],zmm12[2,3,0,1] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k2} = zmm5[0,1,2,3],zmm12[2,3,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512F-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] ; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,8,0,1,0,8,0,1] +; AVX512F-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm6[0],zmm8[0],zmm6[2],zmm8[2],zmm6[4],zmm8[4],zmm6[6],zmm8[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm1[0],zmm16[0],zmm1[2],zmm16[2],zmm1[4],zmm16[4],zmm1[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 @@ -6421,29 +6398,29 @@ ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -6451,43 +6428,43 @@ ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 ; AVX512F-ONLY-FAST-NEXT: movb $112, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm27 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb $14, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512F-ONLY-FAST-NEXT: movb $120, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm31 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -6496,7 +6473,7 @@ ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k3} ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -6531,14 +6508,14 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm28 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k4} ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} @@ -6551,7 +6528,7 @@ ; AVX512F-ONLY-FAST-NEXT: movb $56, %cl ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -6582,7 +6559,7 @@ ; AVX512F-ONLY-FAST-NEXT: movb $8, %al ; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 {%k3} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = @@ -6605,10 +6582,10 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1024(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax) @@ -6619,15 +6596,15 @@ ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) @@ -7083,197 +7060,199 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQ-FAST-NEXT: movb $96, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQ-FAST-NEXT: movb $28, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[0,1,2,3],zmm7[2,3,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm3[0,1,2,3],zmm8[2,3,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,0,5,4,12,0,5] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,1,12,7,0,1,12,7] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] ; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQ-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512DQ-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm18 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQ-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm24[0],ymm20[2],ymm24[2] +; AVX512DQ-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm20 ; AVX512DQ-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm13[2,3,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm14[2,3,0,1] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[0,1,2,3],zmm12[2,3,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[0,1,2,3],zmm12[2,3,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512DQ-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm25 ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: movb $48, %r10b ; AVX512DQ-FAST-NEXT: kmovw %r10d, %k3 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm18 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] ; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm28 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm31 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm15 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 @@ -7281,40 +7260,40 @@ ; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm4, %zmm27 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm24, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm29, %zmm5 ; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] ; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm29 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 @@ -7326,7 +7305,7 @@ ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512DQ-FAST-NEXT: movb $120, %sil @@ -7334,24 +7313,24 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm0 = zmm28[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k3} ; AVX512DQ-FAST-NEXT: movb $-61, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k4} ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -7368,14 +7347,14 @@ ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm29 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -7393,7 +7372,7 @@ ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 @@ -7406,7 +7385,7 @@ ; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm23 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 @@ -7415,10 +7394,10 @@ ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $6, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm15 {%k4} ; AVX512DQ-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} @@ -7428,9 +7407,9 @@ ; AVX512DQ-FAST-NEXT: movb $56, %cl ; AVX512DQ-FAST-NEXT: kmovw %ecx, %k4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -7448,8 +7427,8 @@ ; AVX512DQ-FAST-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> ; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[0,1,2,3],zmm5[2,3,0,1] @@ -7458,8 +7437,8 @@ ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQ-FAST-NEXT: movb $8, %al ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 {%k3} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> ; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = @@ -7480,35 +7459,35 @@ ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 896(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) ; AVX512DQ-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -7963,16 +7942,16 @@ ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $96, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 @@ -7980,205 +7959,208 @@ ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm1[0],ymm19[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512BW-ONLY-FAST-NEXT: movb $28, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm3[0,1,2,3],zmm4[2,3,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,3,7,7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm1, %ymm28, %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [4,12,0,5,4,12,0,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [5,0,14,6,5,0,14,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,13,6,7,0,13,6,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] -; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,13,6,7,0,13,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] ; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm25, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [6,13,14,7,6,13,14,7] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm28, %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm20[0],ymm26[0],ymm20[2],ymm26[2] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm26, %ymm28, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm20[0],ymm0[0],ymm20[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,7,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm0, %ymm1, %ymm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm21[0],ymm25[0],ymm21[2],ymm25[2] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %ymm25, %ymm1, %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm16, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm14[2,3,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k2} = zmm2[0,1,2,3],zmm14[2,3,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm31 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm17, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm27, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm25, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm26 {%k2} = zmm3[0,1,2,3],zmm12[2,3,0,1] +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k2} = zmm5[0,1,2,3],zmm12[2,3,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm1, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm29, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm20, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm27, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm17, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm25, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm31, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [3,0,12,4,3,0,12,4] -; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm31, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [13,5,13,5,13,5,13,5] ; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: movb $48, %r10b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,8,0,1,0,8,0,1] -; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,8,0,1,0,8,0,1] +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm27, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,0,10,2,1,0,10,2] ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm6[0],zmm8[0],zmm6[2],zmm8[2],zmm6[4],zmm8[4],zmm6[6],zmm8[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm31, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm25, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm29, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm3, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm27, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm19 {%k3} = zmm1[0],zmm16[0],zmm1[2],zmm16[2],zmm1[4],zmm16[4],zmm1[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm31, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm18, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm16, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm16, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm23, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm18, %zmm0, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm18, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm18 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm30 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 @@ -8186,29 +8168,29 @@ ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm29, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] -; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm21, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm27 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm21 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm22 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,0,8,0,8,0,8] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -8216,43 +8198,43 @@ ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, (%rax), %zmm4, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: movb $112, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm17 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm15 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 64(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm25 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm23 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 128(%rax), %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm25 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],mem[0] ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm29 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm27 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $3, 192(%rax), %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm27 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512BW-ONLY-FAST-NEXT: movb $120, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm22 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm31 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = zmm0[0,1,2,3],mem[4,5,6,7] @@ -8261,7 +8243,7 @@ ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm19 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm29 {%k3} ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload @@ -8296,14 +8278,14 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 {%k4} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm28 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm28 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm17 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm11 {%k4} @@ -8316,7 +8298,7 @@ ; AVX512BW-ONLY-FAST-NEXT: movb $56, %cl ; AVX512BW-ONLY-FAST-NEXT: kmovd %ecx, %k4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm15 {%k4} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm17 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm11 {%k4} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -8347,7 +8329,7 @@ ; AVX512BW-ONLY-FAST-NEXT: movb $8, %al ; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 {%k3} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 {%k3} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <12,u,u,3,4,5,6,13> ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm7, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = @@ -8370,10 +8352,10 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1280(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1216(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1024(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 768(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 704(%rax) @@ -8384,15 +8366,15 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 384(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 1728(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1664(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1600(%rax) @@ -8848,197 +8830,199 @@ ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: subq $2024, %rsp # imm = 0x7E8 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm8 ; AVX512DQBW-FAST-NEXT: movb $96, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [11,3,11,3,11,3,11,3] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [2,10,0,3,2,10,0,3] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%r9), %ymm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %ymm24 +; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %ymm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %ymm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm1[0],ymm16[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] ; AVX512DQBW-FAST-NEXT: movb $28, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[0,1,2,3],zmm7[2,3,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,3,7,7] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm1, %ymm27, %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [4,12,0,5,4,12,0,5] -; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,1,12,7,0,1,12,7] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm3[0,1,2,3],zmm8[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,0,5,4,12,0,5] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,1,12,7,0,1,12,7] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [5,0,14,6,5,0,14,6] ; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm21, %zmm1 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,13,6,7,0,13,6,7] -; AVX512DQBW-FAST-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,13,6,7,0,13,6,7] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm26, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [15,7,15,7,15,7,15,7] -; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm30 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [6,13,14,7,6,13,14,7] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm29, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] +; AVX512DQBW-FAST-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7] +; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm27, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm18[0],ymm0[0],ymm18[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm27, %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,7,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm0, %ymm2, %ymm18 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm25[0],ymm20[2],ymm25[2] -; AVX512DQBW-FAST-NEXT: vpermt2q %ymm25, %ymm27, %ymm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm20[0],ymm24[0],ymm20[2],ymm24[2] +; AVX512DQBW-FAST-NEXT: vpermt2q %ymm24, %ymm2, %ymm20 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm16, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm13 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm13[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[0,1,2,3],zmm14[2,3,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm10, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm27, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm16, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm12 -; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[0,1,2,3],zmm12[2,3,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm0[0,1,2,3],zmm12[2,3,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm25, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm28, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm24, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [3,0,12,4,3,0,12,4] -; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm26 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm29, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm12 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [3,0,12,4,3,0,12,4] +; AVX512DQBW-FAST-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm25 ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [13,5,13,5,13,5,13,5] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm24, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: movb $48, %r10b ; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k3 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm7, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm7, %zmm18 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [1,0,10,2,1,0,10,2] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm25 {%k3} = zmm0[0],zmm13[0],zmm0[2],zmm13[2],zmm0[4],zmm13[4],zmm0[6],zmm13[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm9 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] ; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm28 ; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm27, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm31 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm0 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm24, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm23, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm15 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm16 {%k3} = zmm11[0],zmm17[0],zmm11[2],zmm17[2],zmm11[4],zmm17[4],zmm11[6],zmm17[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm3 @@ -9046,40 +9030,40 @@ ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm21 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm29, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm27, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm15, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm23, %zmm4, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm4, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm30 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm29[0],zmm8[0],zmm29[2],zmm8[2],zmm29[4],zmm8[4],zmm29[6],zmm8[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm29, %zmm5 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] ; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm29 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 @@ -9091,7 +9075,7 @@ ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm26 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm25 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} ; AVX512DQBW-FAST-NEXT: movb $120, %sil @@ -9099,24 +9083,24 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # zmm0 = zmm25[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm0 = zmm28[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm16 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm29 {%k3} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm27 {%k3} ; AVX512DQBW-FAST-NEXT: movb $-61, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k5} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k4} ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload @@ -9133,14 +9117,14 @@ ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k4} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm30 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm5 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm30 {%k3} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm30 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm29 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -9158,7 +9142,7 @@ ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm8 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm8 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],mem[0] ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 @@ -9171,7 +9155,7 @@ ; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 64(%rax), %zmm8, %zmm19 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload -; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm24 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x2 $3, 128(%rax), %zmm8, %zmm23 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm9 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm8, %zmm5 @@ -9180,10 +9164,10 @@ ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $6, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 72(%rcx), %ymm5 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm14 {%k4} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm15 {%k4} ; AVX512DQBW-FAST-NEXT: vpbroadcastq 136(%rcx), %ymm5 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = mem[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm4 {%k4} @@ -9193,9 +9177,9 @@ ; AVX512DQBW-FAST-NEXT: movb $56, %cl ; AVX512DQBW-FAST-NEXT: kmovd %ecx, %k4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm26 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k4} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 {%k4} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload @@ -9213,8 +9197,8 @@ ; AVX512DQBW-FAST-NEXT: # zmm10 = zmm22[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,11,u,u,4,5,6,7> ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,1,11,u,4,5,6,7> -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,11,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm9, %zmm12, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm12 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm12[0],mem[0],ymm12[2],mem[2] ; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k2} = zmm12[0,1,2,3],zmm5[2,3,0,1] @@ -9223,8 +9207,8 @@ ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1} ; AVX512DQBW-FAST-NEXT: movb $8, %al ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm2 {%k3} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 {%k3} ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13> ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm11, %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = @@ -9245,35 +9229,35 @@ ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1472(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1408(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1280(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm3, 1024(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 896(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 768(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 704(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 640(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm1, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 384(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 192(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 64(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1664(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1536(%rax) ; AVX512DQBW-FAST-NEXT: addq $2024, %rsp # imm = 0x7E8 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq @@ -10430,7 +10414,7 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: subq $3448, %rsp # imm = 0xD78 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps (%rsi), %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10444,9 +10428,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rax), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm2 @@ -10476,30 +10458,27 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -10509,9 +10488,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 @@ -10521,9 +10498,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm1 @@ -10558,8 +10533,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10567,12 +10541,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10591,9 +10563,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm1 @@ -10603,9 +10573,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 128(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 @@ -10640,8 +10608,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10649,12 +10616,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10673,9 +10638,7 @@ ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm1 @@ -10685,9 +10648,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 @@ -10722,8 +10683,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -10731,12 +10691,10 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 232(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10755,9 +10713,7 @@ ; AVX1-ONLY-NEXT: vmovaps 240(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm1 @@ -10767,9 +10723,7 @@ ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 256(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %ymm1 @@ -10804,21 +10758,18 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 288(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 288(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 288(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rax), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 @@ -10837,9 +10788,7 @@ ; AVX1-ONLY-NEXT: vmovaps 304(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm1 @@ -10849,9 +10798,7 @@ ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %ymm1 @@ -10859,76 +10806,67 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 336(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 336(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%r9), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%r9), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 336(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rcx), %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 352(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rcx), %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovaps 352(%r9), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 352(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 352(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 352(%rax), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %ymm15 +; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %ymm9 ; AVX1-ONLY-NEXT: vmovaps 368(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm15[1],ymm0[1],ymm15[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 368(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 352(%r9), %ymm14 +; AVX1-ONLY-NEXT: vmovaps 352(%r9), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 368(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm13[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm7[0] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovaps 384(%rax), %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 384(%rax), %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -10936,68 +10874,65 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm9 +; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rax), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm8 +; AVX1-ONLY-NEXT: vmovaps 384(%r9), %ymm4 ; AVX1-ONLY-NEXT: vmovaps 400(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm1, %ymm5 -; AVX1-ONLY-NEXT: vmovaps 416(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm0[0],xmm5[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rcx), %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vmovapd 416(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovapd 416(%r8), %xmm13 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rax), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2,3],ymm1[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovaps 480(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 480(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm5[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rcx), %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdx), %ymm15, %ymm15 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11005,8 +10940,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm0 @@ -11021,24 +10955,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm0 @@ -11053,24 +10984,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm0 @@ -11085,24 +11013,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm0 @@ -11117,24 +11042,21 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r8), %ymm0 @@ -11148,207 +11070,199 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 304(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovaps 336(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm15[0],ymm0[2],ymm15[2] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm10[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm14[0],ymm0[2],ymm14[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovaps 368(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm13[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[2],ymm6[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],mem[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r8), %ymm0 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[2],ymm8[2] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovaps 400(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm4[1] -; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm1 -; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm2 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-ONLY-NEXT: vmovapd 416(%rdi), %ymm0 +; AVX1-ONLY-NEXT: vmovapd 416(%rsi), %ymm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 416(%r8), %ymm0 +; AVX1-ONLY-NEXT: vmovaps 416(%r8), %ymm0 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovapd 432(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm6[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 432(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 432(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm1 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 416(%rax), %ymm9 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm13, %ymm2 +; AVX1-ONLY-NEXT: vmovapd 448(%r8), %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 448(%rax), %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 448(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovapd 448(%rsi), %ymm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm9 = ymm3[0,1],ymm0[2,3] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovapd 464(%rdi), %xmm0 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm4 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 448(%rax), %ymm6 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vmovapd 480(%r8), %ymm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 480(%r8), %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rax), %ymm0, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovapd 480(%rdi), %ymm2 -; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],mem[2,3] -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,2] -; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm11 -; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3] +; AVX1-ONLY-NEXT: vmovapd 480(%rsi), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovapd 496(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],mem[2,3] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,2] +; AVX1-ONLY-NEXT: vmovapd 480(%rax), %ymm1 +; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm5[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm7[1],ymm5[2],ymm7[3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm6 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovapd 432(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm12 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm3 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm9[0],ymm4[1],ymm9[2,3] +; AVX1-ONLY-NEXT: vmovapd 464(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovapd 464(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],xmm7[0] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3] +; AVX1-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm9 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1,2],ymm9[3] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovapd 496(%rdx), %xmm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3] -; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm4 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm1 -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm11[1],ymm2[2,3] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm7[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm7 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm7 = ymm4[0,1],ymm7[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm6[3] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 496(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovapd 496(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm6 +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3] +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[0],mem[0] -; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm15[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[0],mem[0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm8 = xmm8[0],mem[0] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm10 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 +; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm12 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rdx), %xmm11 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 320(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps 384(%rdx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps 448(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, 2704(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm1, 2688(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm8, 3152(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm15, 3136(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 2256(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm2, 2240(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm13, 1360(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 3152(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm8, 3136(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, 2704(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 2688(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm10, 2256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm1, 2240(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm11, 1808(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm2, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 1360(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm3, 1344(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 464(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm11, 448(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 912(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm13, 912(%rax) ; AVX1-ONLY-NEXT: vmovaps %xmm4, 896(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm14, 1808(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 1792(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm14, 464(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, (%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 3520(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -11557,37 +11471,39 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $3832, %rsp # imm = 0xEF8 +; AVX1-ONLY-NEXT: addq $3448, %rsp # imm = 0xD78 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride7_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $3624, %rsp # imm = 0xE28 +; AVX2-ONLY-NEXT: subq $3544, %rsp # imm = 0xDD8 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm3 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm7 -; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX2-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm5 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm11 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm8[0],ymm6[0],ymm8[2],ymm6[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],mem[0],ymm3[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11598,8 +11514,9 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm2 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm2[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 @@ -11873,11 +11790,11 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 256(%r8), %ymm4 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] +; AVX2-ONLY-NEXT: vmovaps 256(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11892,7 +11809,7 @@ ; AVX2-ONLY-NEXT: vmovaps 288(%rax), %xmm2 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11902,713 +11819,711 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] ; AVX2-ONLY-NEXT: vbroadcastsd 296(%rcx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 304(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 336(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm2 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm3 -; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 368(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm0 -; AVX2-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm3 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] -; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm4 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],mem[0],ymm4[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps 432(%rax), %xmm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm13 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm13[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm14 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm14[0,1],ymm1[0,1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm10 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm10[1] -; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm11, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 288(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%r9), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 304(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 328(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm3 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%r9), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 336(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %xmm0 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 352(%rax), %xmm2 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 360(%rcx), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm0 +; AVX2-ONLY-NEXT: vmovaps 352(%r8), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 352(%r9), %ymm4 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 368(%rax), %xmm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 392(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps 384(%rax), %xmm0 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm5 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%r8), %ymm13 +; AVX2-ONLY-NEXT: vmovaps 384(%r9), %ymm2 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm13[0],ymm2[0],ymm13[2],ymm2[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rax), %xmm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %xmm3 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1] +; AVX2-ONLY-NEXT: vbroadcastsd 424(%rcx), %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm6[1] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %ymm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm8[0],mem[0],ymm8[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] ; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovaps 432(%rax), %xmm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm1 +; AVX2-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 448(%r8), %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 464(%rax), %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; AVX2-ONLY-NEXT: vbroadcastsd 488(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vinsertf128 $1, 480(%r8), %ymm0, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm7 +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 496(%rax), %xmm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm10 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 32(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 88(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 96(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 120(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 152(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 160(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 184(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 216(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 224(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 248(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 280(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 288(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 312(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %xmm8 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 344(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vbroadcastsd 352(%rcx), %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 376(%rcx), %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm11 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,3,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm2[1],ymm13[3],ymm2[3] +; AVX2-ONLY-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 408(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vbroadcastsd 416(%rcx), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm8[0],mem[0],ymm8[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm13[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%rcx), %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 440(%r9), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 448(%r8), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%r9), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 448(%rax), %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 472(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 472(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rax), %ymm3 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rcx), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 480(%r8), %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%r9), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vbroadcastsd 480(%rax), %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],mem[0],ymm5[2],mem[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm14[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 496(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 504(%rcx), %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] ; AVX2-ONLY-NEXT: vbroadcastsd 504(%r9), %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm8 -; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 480(%rax), %ymm3 +; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1],ymm3[2,3],mem[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdi ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm15[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm0 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm10[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm15 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 64(%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm13 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %xmm13 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm6 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %xmm12 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %xmm10 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm8 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm8[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %xmm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm8 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %xmm14 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0] +; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %xmm14 ; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm5 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm5[0],mem[0] -; AVX2-ONLY-NEXT: vmovaps %xmm5, 16(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm0, (%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm14, 2704(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 2688(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm8, 3152(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 3136(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm6, 2256(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm7, 2240(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 1360(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 1344(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm12, 464(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm1, 448(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm15, 912(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm11, 896(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm2, 1808(%r8) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 1792(%r8) +; AVX2-ONLY-NEXT: vmovaps %xmm14, 3152(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm6, 3136(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm8, 2704(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm11, 2688(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 2256(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm0, 2240(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm10, 1808(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm1, 1792(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 1360(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm2, 1344(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm13, 912(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm3, 896(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 464(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 448(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 16(%rdi) +; AVX2-ONLY-NEXT: vmovaps %xmm7, (%rdi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3552(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3552(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3520(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3520(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3488(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3488(%r8) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3456(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3424(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3392(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3360(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3360(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3328(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3328(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3296(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3264(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3264(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3232(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3232(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3200(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3168(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3168(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3104(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3104(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3072(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3040(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 3008(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 3008(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2976(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2944(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2912(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2912(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2880(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2880(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2848(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2816(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2816(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2784(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2752(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2720(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2720(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2656(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2656(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2624(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2592(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2560(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2560(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2528(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2496(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2464(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2432(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2400(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2368(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2336(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2304(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2272(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2272(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2208(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2176(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2176(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2144(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2144(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2112(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2112(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2080(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2048(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2048(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 2016(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 2016(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1984(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1984(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1952(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1952(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1920(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1888(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1856(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1824(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1760(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1760(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1728(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1696(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1664(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1632(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1600(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1568(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1568(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1536(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1536(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1504(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1472(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1440(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1408(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1376(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1312(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1312(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1280(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1248(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1216(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1184(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1152(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1120(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1088(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1088(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1056(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 1024(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 992(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 960(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 928(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 864(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 832(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 800(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 768(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 736(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 704(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 672(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 640(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 608(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 576(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 544(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 512(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 480(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 416(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 384(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 256(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 224(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $3624, %rsp # imm = 0xE28 +; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX2-ONLY-NEXT: addq $3544, %rsp # imm = 0xDD8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -33,20 +33,20 @@ ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] ; SSE-NEXT: movaps %xmm2, %xmm9 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm3[0] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm7[0] -; SSE-NEXT: movaps %xmm4, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm5[0] +; SSE-NEXT: movaps %xmm4, %xmm10 +; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm5[0] +; SSE-NEXT: movaps %xmm6, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm7[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm5[1] -; SSE-NEXT: movaps %xmm4, 96(%rax) +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm7[1] ; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm4, 96(%rax) ; SSE-NEXT: movaps %xmm2, 80(%rax) ; SSE-NEXT: movaps %xmm0, 64(%rax) -; SSE-NEXT: movaps %xmm11, 32(%rax) -; SSE-NEXT: movaps %xmm10, 48(%rax) +; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm10, 32(%rax) ; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps %xmm8, (%rax) ; SSE-NEXT: retq @@ -153,63 +153,63 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; SSE-NEXT: movaps (%rdi), %xmm4 ; SSE-NEXT: movaps 16(%rdi), %xmm2 -; SSE-NEXT: movaps (%rsi), %xmm10 -; SSE-NEXT: movaps 16(%rsi), %xmm11 +; SSE-NEXT: movaps (%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rsi), %xmm8 ; SSE-NEXT: movaps (%rdx), %xmm1 -; SSE-NEXT: movaps 16(%rdx), %xmm3 -; SSE-NEXT: movaps (%rcx), %xmm7 -; SSE-NEXT: movaps 16(%rcx), %xmm12 -; SSE-NEXT: movaps (%r8), %xmm5 -; SSE-NEXT: movaps 16(%r8), %xmm0 -; SSE-NEXT: movaps (%r9), %xmm13 +; SSE-NEXT: movaps 16(%rdx), %xmm5 +; SSE-NEXT: movaps (%rcx), %xmm13 +; SSE-NEXT: movaps 16(%rcx), %xmm10 +; SSE-NEXT: movaps (%r8), %xmm3 +; SSE-NEXT: movaps 16(%r8), %xmm7 +; SSE-NEXT: movaps (%r9), %xmm14 +; SSE-NEXT: movaps 16(%r9), %xmm12 ; SSE-NEXT: movaps (%r10), %xmm6 -; SSE-NEXT: movaps 16(%r10), %xmm9 -; SSE-NEXT: movaps (%rax), %xmm14 -; SSE-NEXT: movaps 16(%rax), %xmm15 -; SSE-NEXT: movaps %xmm1, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm7[0] -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1] -; SSE-NEXT: movaps %xmm4, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm10[1] -; SSE-NEXT: movaps %xmm3, %xmm10 +; SSE-NEXT: movaps 16(%r10), %xmm0 +; SSE-NEXT: movaps (%rax), %xmm15 +; SSE-NEXT: movaps %xmm4, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm9[0] +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1] +; SSE-NEXT: movaps %xmm1, %xmm9 +; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm13[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1] +; SSE-NEXT: movaps %xmm3, %xmm13 +; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm14[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1] +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm15[1] +; SSE-NEXT: movaps %xmm2, %xmm15 +; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm8[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm8[1] +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1] +; SSE-NEXT: movaps %xmm7, %xmm10 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm12[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm12[1] -; SSE-NEXT: movaps %xmm2, %xmm12 -; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm11[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1] -; SSE-NEXT: movaps %xmm6, %xmm11 -; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm14[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm14[1] -; SSE-NEXT: movaps %xmm5, %xmm14 -; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm13[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm13[1] -; SSE-NEXT: movaps %xmm9, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm15[1] -; SSE-NEXT: movaps 16(%r9), %xmm15 -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm15[0] -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm12[1] +; SSE-NEXT: movaps 16(%rax), %xmm12 +; SSE-NEXT: movaps %xmm0, %xmm11 +; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0] +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movaps %xmm0, 224(%rax) -; SSE-NEXT: movaps %xmm9, 240(%rax) -; SSE-NEXT: movaps %xmm8, 160(%rax) -; SSE-NEXT: movaps %xmm13, 176(%rax) -; SSE-NEXT: movaps %xmm5, 96(%rax) -; SSE-NEXT: movaps %xmm6, 112(%rax) -; SSE-NEXT: movaps %xmm14, 32(%rax) -; SSE-NEXT: movaps %xmm11, 48(%rax) +; SSE-NEXT: movaps %xmm0, 240(%rax) +; SSE-NEXT: movaps %xmm7, 224(%rax) +; SSE-NEXT: movaps %xmm5, 208(%rax) ; SSE-NEXT: movaps %xmm2, 192(%rax) -; SSE-NEXT: movaps %xmm3, 208(%rax) -; SSE-NEXT: movaps %xmm12, 128(%rax) -; SSE-NEXT: movaps %xmm10, 144(%rax) -; SSE-NEXT: movaps %xmm4, 64(%rax) +; SSE-NEXT: movaps %xmm11, 176(%rax) +; SSE-NEXT: movaps %xmm10, 160(%rax) +; SSE-NEXT: movaps %xmm8, 144(%rax) +; SSE-NEXT: movaps %xmm15, 128(%rax) +; SSE-NEXT: movaps %xmm6, 112(%rax) +; SSE-NEXT: movaps %xmm3, 96(%rax) ; SSE-NEXT: movaps %xmm1, 80(%rax) -; SSE-NEXT: movaps %xmm7, (%rax) +; SSE-NEXT: movaps %xmm4, 64(%rax) +; SSE-NEXT: movaps %xmm14, 48(%rax) +; SSE-NEXT: movaps %xmm13, 32(%rax) +; SSE-NEXT: movaps %xmm9, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf4: @@ -219,52 +219,52 @@ ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm2 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm3 -; AVX1-ONLY-NEXT: vmovaps (%r11), %ymm1 -; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm4 -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm6[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps (%r11), %ymm4 +; AVX1-ONLY-NEXT: vmovaps (%r10), %ymm5 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm6[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm5 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm5[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm5[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm12 = xmm11[1],xmm8[1] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm8[0] -; AVX1-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm12, 80(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm10, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm9, 112(%rax) -; AVX1-ONLY-NEXT: vmovaps %xmm6, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm10 +; AVX1-ONLY-NEXT: vmovaps (%r11), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX1-ONLY-NEXT: vmovaps %xmm8, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm7, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm4, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm15, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm12, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm9, (%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm6, 16(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -277,50 +277,50 @@ ; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps (%rcx), %ymm5 -; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm6 -; AVX2-ONLY-NEXT: vmovaps (%r11), %ymm7 -; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm8 -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],ymm6[0],ymm1[2],ymm6[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm7[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm6 +; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm7 +; AVX2-ONLY-NEXT: vmovaps (%r11), %ymm8 +; AVX2-ONLY-NEXT: vmovaps (%r10), %ymm9 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm6[0],ymm7[0],ymm6[2],ymm7[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3],ymm1[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm3 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm4 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%r11), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm8 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm8[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm12 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm12[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm12[0],xmm11[0] -; AVX2-ONLY-NEXT: vmovaps %xmm8, 16(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm7, (%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm13, 80(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm10, 64(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm4, 48(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm3, 32(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm9, 112(%rax) -; AVX2-ONLY-NEXT: vmovaps %xmm5, 96(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 192(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 224(%rax) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm6[1],ymm7[1],ymm6[3],ymm7[3] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm8 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm8[0],xmm7[0] +; AVX2-ONLY-NEXT: vmovaps (%r10), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%r11), %xmm11 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm12 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm13 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm7[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vmovaps %xmm8, 96(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm7, 112(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm5, 80(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm4, 64(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm15, 32(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm12, 48(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm9, 16(%rax) +; AVX2-ONLY-NEXT: vmovaps %xmm6, (%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm3, 224(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 192(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 160(%rax) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -544,245 +544,237 @@ ; ; AVX1-ONLY-LABEL: store_i64_stride8_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: pushq %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm8[1],ymm2[2],ymm8[2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm8 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm6[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2],ymm9[2] -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm9 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm5[0] +; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm9 +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm10[1],ymm1[2],ymm10[2] +; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm8[1],xmm5[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm9[0],xmm8[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[2] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm9[1],xmm8[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm11[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm9 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm10 -; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm11 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm11[0],xmm10[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm9 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] -; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm12 -; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm13 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm12[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm14 +; AVX1-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] -; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm15 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm15[0],xmm14[0] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm11 +; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm13[1],xmm12[1] -; AVX1-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vmovaps 48(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm15 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX1-ONLY-NEXT: vmovaps 48(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm15[0] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm13[1],xmm12[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1] +; AVX1-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] -; AVX1-ONLY-NEXT: vmovaps %ymm11, 192(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm8, 64(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm6, (%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm5, 96(%rdx) -; AVX1-ONLY-NEXT: vmovapd %ymm4, 32(%rdx) +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 480(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm14, 384(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 320(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm5, 288(%rdx) +; AVX1-ONLY-NEXT: vmovapd %ymm4, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm10, 192(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm9, 160(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm8, 128(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rdx) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 160(%rdx) -; AVX1-ONLY-NEXT: popq %rax +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i64_stride8_vf8: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: pushq %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm6 -; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm3 +; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm3 +; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm9 -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm7 +; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm8 +; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm7[1],xmm5[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 +; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm12 +; AVX2-ONLY-NEXT: vmovaps (%rax), %xmm12 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm11 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm14 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm15[1],xmm14[1] -; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 +; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm13 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 +; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %xmm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm14[1],xmm13[1] +; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm10[1],xmm9[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%r10), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm9[1],xmm8[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm13[1],xmm11[1] -; AVX2-ONLY-NEXT: vbroadcastsd 8(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps (%rcx), %xmm0 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm7[0],xmm5[0] -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm11[1],xmm10[1] +; AVX2-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm15 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm15 +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm7[0],xmm5[0] +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm8 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm15[0],xmm14[0] -; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm15 -; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm14[0],xmm13[0] +; AVX2-ONLY-NEXT: vmovaps (%rsi), %ymm13 +; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm12, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovaps (%r8), %ymm12 -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm9[0],xmm8[0] ; AVX2-ONLY-NEXT: vmovaps (%r9), %ymm14 -; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm13[0],xmm11[0] -; AVX2-ONLY-NEXT: vinsertf128 $1, (%rdx), %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm11 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm15[1],ymm11[1],ymm15[3],ymm11[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm1 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm15[0],ymm11[0],ymm15[2],ymm11[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm15 -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm3 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm13[0],ymm15[0],ymm13[2],ymm15[2] -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],mem[2,3] -; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm11[0],xmm10[0] +; AVX2-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vbroadcastsd %xmm15, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],ymm13[0],ymm0[2],ymm13[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rcx), %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm14[0],ymm12[2],ymm14[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 16(%rax), %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm13[1],ymm0[3],ymm13[3] +; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm13 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm12[1],ymm14[1],ymm12[3],ymm14[3] ; AVX2-ONLY-NEXT: vbroadcastsd 24(%r10), %ymm13 ; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3],ymm13[2,3] -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] -; AVX2-ONLY-NEXT: vbroadcastsd 24(%rdx), %ymm8 -; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm11[0],ymm7[2],ymm11[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rcx), %ymm14 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vmovaps 32(%r8), %ymm14 +; AVX2-ONLY-NEXT: vmovaps 32(%r9), %ymm15 +; AVX2-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm15[0],ymm14[2],ymm15[2] +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-ONLY-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%rdx), %ymm7 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3],ymm7[2,3] +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm14[1],ymm15[1],ymm14[3],ymm15[3] +; AVX2-ONLY-NEXT: vbroadcastsd 56(%r10), %ymm11 +; AVX2-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm11[2,3] ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],mem[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] -; AVX2-ONLY-NEXT: vmovaps %ymm7, 192(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 384(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm0, 448(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm11, 416(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm1, 480(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm2, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 480(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, 448(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm1, 416(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm13, 384(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 160(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm6, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 352(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 320(%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 256(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 288(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm6, 256(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm0, 96(%rdx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm0, 288(%rdx) -; AVX2-ONLY-NEXT: popq %rax +; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -791,117 +783,119 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm1 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512F-NEXT: movb $-64, %r8b ; AVX512F-NEXT: kmovw %r8d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm11[1],xmm10[1] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm12[1] -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512F-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm8[0] +; AVX512F-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512F-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX512F-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm14, %zmm11 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm11[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm14 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-NEXT: vinserti32x4 $0, %xmm8, %zmm10, %zmm8 +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm10 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm12 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm11 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512F-NEXT: vpermi2q %zmm9, %zmm5, %zmm12 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm4 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm17 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm6 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm5 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [7,15,7,15] ; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm11[0],xmm10[0] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm14[0],xmm12[0] -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm12 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512F-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512F-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512F-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512F-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] +; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] +; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -910,117 +904,119 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm8 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $-64, %r8b ; AVX512BW-NEXT: kmovd %r8d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm11[1],xmm10[1] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm12[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm13, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm8[0] +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm12[0] +; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm14, %zmm11 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm11[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm14 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm11 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm11 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm10[1],xmm8[1] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm13[1],xmm12[1] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm10, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm11[4,5,6,7] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm11 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm12 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] +; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm10 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm12 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm13 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm11 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,13,5,13] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm12, %zmm13 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] -; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 +; AVX512BW-NEXT: vpermi2q %zmm9, %zmm5, %zmm12 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm14 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm13, %zmm12 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm17 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm6 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm14 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm5 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [7,15,7,15] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm11[0],xmm10[0] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm14[0],xmm12[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm8, %zmm9 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm12 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[2],ymm12[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm15[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512BW-NEXT: vpermi2q %zmm7, %zmm6, %zmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm6 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm7 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm14[1],ymm15[3],ymm14[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i64>, ptr %in.vecptr0, align 64 @@ -1908,212 +1904,218 @@ ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al -; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r8b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r8d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -2121,212 +2123,218 @@ ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-ONLY-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: movb $-64, %al -; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %r8b +; AVX512F-ONLY-FAST-NEXT: kmovw %r8d, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -2334,212 +2342,218 @@ ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQ-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: movb $-64, %al -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $-64, %r8b +; AVX512DQ-SLOW-NEXT: kmovw %r8d, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -2547,212 +2561,218 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQ-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: movb $-64, %al -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: movb $-64, %r8b +; AVX512DQ-FAST-NEXT: kmovw %r8d, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -2760,212 +2780,218 @@ ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r8b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r8d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; @@ -2973,212 +2999,218 @@ ; AVX512BW-ONLY-FAST: # %bb.0: ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al -; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r8b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r8d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; @@ -3186,212 +3218,218 @@ ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQBW-SLOW-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: movb $-64, %al -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $-64, %r8b +; AVX512DQBW-SLOW-NEXT: kmovd %r8d, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; @@ -3399,212 +3437,218 @@ ; AVX512DQBW-FAST: # %bb.0: ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512DQBW-FAST-NEXT: vmovaps 64(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm12 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm20 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: movb $-64, %al -; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r11), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r11), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $-64, %r8b +; AVX512DQBW-FAST-NEXT: kmovd %r8d, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm22 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm18 = xmm17[0],xmm15[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm18, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm14[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm13[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm17[1],xmm15[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm4[0,1,2,3],zmm19[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm13[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm22[0],xmm16[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm14[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm13[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm22[1],xmm16[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[0,1,2,3],zmm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm29[0],zmm5[2],zmm29[2],zmm5[4],zmm29[4],zmm5[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm29[1],zmm5[3],zmm29[3],zmm5[5],zmm29[5],zmm5[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm8[0],zmm30[0],zmm8[2],zmm30[2],zmm8[4],zmm30[4],zmm8[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm3 {%k1} = zmm8[1],zmm30[1],zmm8[3],zmm30[3],zmm8[5],zmm30[5],zmm8[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm21 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm21, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm14[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm1 {%k1} = zmm2[0],zmm28[0],zmm2[2],zmm28[2],zmm2[4],zmm28[4],zmm2[6],zmm28[6] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm25, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm2[1],zmm28[1],zmm2[3],zmm28[3],zmm2[5],zmm28[5],zmm2[7],zmm28[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm21, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm11[0],ymm7[0],ymm11[2],ymm7[2] ; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm14 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm9, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm11[1],ymm7[1],ymm11[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm15[1],ymm13[1],ymm15[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm8[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm8[0],ymm7[0],ymm8[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm21[0],ymm14[0],ymm21[2],ymm14[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm7[1],ymm8[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm21[1],ymm14[1],ymm21[3],ymm14[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 @@ -4346,84 +4390,84 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm6 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm7 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[2] -; AVX1-ONLY-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm5[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm3 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[2] ; AVX1-ONLY-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[2] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4464,22 +4508,22 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm4[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4916,11 +4960,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -5450,480 +5494,484 @@ ; ; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: -; AVX512F-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-SLOW-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5932,9 +5980,6 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -5943,490 +5988,503 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-SLOW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; ; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32: ; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-FAST-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6435,9 +6493,6 @@ ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6446,490 +6501,503 @@ ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-FAST-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; ; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQ-SLOW-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6938,9 +7006,6 @@ ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -6949,490 +7014,503 @@ ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQ-SLOW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQ-FAST-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7441,9 +7519,6 @@ ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7452,490 +7527,503 @@ ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQ-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQ-FAST-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32: ; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-SLOW-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7944,9 +8032,6 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -7955,490 +8040,503 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-SLOW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32: ; AVX512BW-ONLY-FAST: # %bb.0: -; AVX512BW-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-FAST-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8447,9 +8545,6 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8458,490 +8553,503 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-FAST-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32: ; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-SLOW-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8950,9 +9058,6 @@ ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -8961,490 +9066,503 @@ ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1024(%rax) ; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-SLOW-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq ; ; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32: ; AVX512DQBW-FAST: # %bb.0: -; AVX512DQBW-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-FAST-NEXT: subq $2696, %rsp # imm = 0xA88 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm31 ; AVX512DQBW-FAST-NEXT: movb $-64, %r11b ; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm10, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm8, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm19[0],zmm18[0],zmm19[2],zmm18[2],zmm19[4],zmm18[4],zmm19[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [6,14,6,14] ; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm19[1],zmm18[1],zmm19[3],zmm18[3],zmm19[5],zmm18[5],zmm19[7],zmm18[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm29, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm30 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm30 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm30, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm23[0],zmm21[0],zmm23[2],zmm21[2],zmm23[4],zmm21[4],zmm23[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm23[1],zmm21[1],zmm23[3],zmm21[3],zmm23[5],zmm21[5],zmm23[7],zmm21[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm16[0],zmm31[0],zmm16[2],zmm31[2],zmm16[4],zmm31[4],zmm16[6],zmm31[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm31[1],zmm16[3],zmm31[3],zmm16[5],zmm31[5],zmm16[7],zmm31[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm17 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm29, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm29, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm29, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm20[0],zmm5[0],zmm20[2],zmm5[2],zmm20[4],zmm5[4],zmm20[6],zmm5[6] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm20[1],zmm5[1],zmm20[3],zmm5[3],zmm20[5],zmm5[5],zmm20[7],zmm5[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm6, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm15 = zmm22[0],zmm14[0],zmm22[2],zmm14[2],zmm22[4],zmm14[4],zmm22[6],zmm14[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm22[1],zmm14[1],zmm22[3],zmm14[3],zmm22[5],zmm14[5],zmm22[7],zmm14[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm28 {%k1} = zmm17[0],zmm1[0],zmm17[2],zmm1[2],zmm17[4],zmm1[4],zmm17[6],zmm1[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm17[1],zmm1[1],zmm17[3],zmm1[3],zmm17[5],zmm1[5],zmm17[7],zmm1[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm29, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm26, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm26[0],zmm2[0],zmm26[2],zmm2[2],zmm26[4],zmm2[4],zmm26[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm26[1],zmm2[1],zmm26[3],zmm2[3],zmm26[5],zmm2[5],zmm26[7],zmm2[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm2[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm0[0],xmm1[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm28, %zmm14, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm14[0,1,2,3],zmm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm24[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm2[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm24 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm24[0],xmm9[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm25, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm7[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm24[1],xmm9[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm7, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm4[0],xmm2[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm16 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm16[0],xmm9[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm24, %zmm8, %zmm8 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm30[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm16[1],xmm9[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm4[1],xmm2[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm9[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm23 = xmm21[0],xmm16[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm23, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm11[0,1,2,3],zmm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm21[1],xmm16[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm6[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm15, %zmm24 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm18, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm11, %zmm0, %zmm11 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm27, %zmm27 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm10, %zmm10 ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm15, %zmm0, %zmm15 ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm23[0],ymm21[0],ymm23[2],ymm21[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm23[1],ymm21[1],ymm23[3],ymm21[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm16 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm16[0],ymm7[0],ymm16[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm8[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm16[1],ymm7[1],ymm16[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm3, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm16 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm16[0],ymm8[0],ymm16[2],ymm8[2] ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm13[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm8 = ymm16[1],ymm8[1],ymm16[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm12[2,3],ymm8[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm22, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm13[0],ymm12[0],ymm13[2],ymm12[2] ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm18[0],ymm16[0],ymm18[2],ymm16[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm13[1],ymm12[1],ymm13[3],ymm12[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm18[1],ymm16[1],ymm18[3],ymm16[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm12[2,3],ymm6[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm26, %zmm6 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1728(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQBW-FAST-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm15, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1792(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm11, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1280(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9453,9 +9571,6 @@ ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -9464,11 +9579,20 @@ ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 1088(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) ; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) -; AVX512DQBW-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-FAST-NEXT: addq $2696, %rsp # imm = 0xA88 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 @@ -10912,58 +11036,59 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm8 = xmm7[0],xmm6[0] -; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2],ymm10[2] -; AVX1-ONLY-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm8 -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm7[1],xmm6[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 32(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 32(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm6 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 40(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 64(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, 64(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[2] +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -10971,19 +11096,18 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 96(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%rdx), %ymm3 @@ -10991,7 +11115,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -11024,100 +11148,100 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 160(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm6 +; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, 160(%r10), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%rdx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 200(%r10), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%rdx), %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 192(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, 192(%r10), %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] ; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%rdx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd 200(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%rdx), %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovaps 256(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 224(%r8), %xmm4 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 256(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 256(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 256(%r8), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] ; AVX1-ONLY-NEXT: vmovaps 256(%rax), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 @@ -11164,30 +11288,30 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdx), %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%rdx), %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 320(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r10), %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastsd 328(%rdx), %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX1-ONLY-NEXT: vmovaps 320(%rax), %xmm5 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 320(%r10), %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vbroadcastsd 328(%rdx), %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 328(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -11248,30 +11372,30 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm1 -; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[2] -; AVX1-ONLY-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 416(%r9), %xmm3 +; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rcx), %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[2] +; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 416(%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 416(%r8), %xmm4 -; AVX1-ONLY-NEXT: vmovaps 416(%rax), %xmm5 -; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[2] -; AVX1-ONLY-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm5 = xmm4[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovaps 416(%rax), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[2] +; AVX1-ONLY-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm2[1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vbroadcastsd 424(%r10), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -12048,11 +12172,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 @@ -12060,11 +12184,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 64(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 64(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 72(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 64(%rax), %xmm1 @@ -12084,11 +12208,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 96(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 104(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rax), %xmm1 @@ -12132,11 +12256,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 160(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 160(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 168(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rax), %xmm1 @@ -12156,11 +12280,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 192(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 192(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 200(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rax), %xmm1 @@ -12180,11 +12304,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm0 -; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps 224(%r9), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 224(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 232(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rax), %xmm1 @@ -12252,11 +12376,11 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm0 +; AVX2-ONLY-NEXT: vmovaps 320(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovaps 320(%r8), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 328(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 320(%rax), %xmm1 @@ -12319,15 +12443,15 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm13 -; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps 416(%rcx), %xmm1 +; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 416(%r9), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps 416(%r8), %xmm13 +; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm13[1],xmm0[1] ; AVX2-ONLY-NEXT: vbroadcastsd 424(%r10), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rax), %xmm12 @@ -12556,10 +12680,9 @@ ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm14 = xmm14[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%rdx), %ymm14, %ymm14 -; AVX2-ONLY-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-ONLY-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm13[0],mem[0] ; AVX2-ONLY-NEXT: vinsertf128 $1, 416(%r10), %ymm13, %ymm13 @@ -13254,592 +13377,586 @@ ; AVX512F-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -13851,8 +13968,8 @@ ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -13862,7 +13979,7 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -13881,388 +13998,404 @@ ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq @@ -14272,592 +14405,586 @@ ; AVX512F-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -14869,8 +14996,8 @@ ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -14880,7 +15007,7 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -14899,388 +15026,404 @@ ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512F-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq @@ -15290,592 +15433,586 @@ ; AVX512DQ-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-SLOW-NEXT: movb $-64, %r11b ; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -15887,8 +16024,8 @@ ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -15898,7 +16035,7 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -15917,388 +16054,404 @@ ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq @@ -16308,592 +16461,586 @@ ; AVX512DQ-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQ-FAST-NEXT: movb $-64, %r11b ; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -16905,8 +17052,8 @@ ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -16916,7 +17063,7 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -16935,388 +17082,404 @@ ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQ-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -17326,592 +17489,586 @@ ; AVX512BW-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -17923,8 +18080,8 @@ ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -17934,7 +18091,7 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -17953,388 +18110,404 @@ ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper ; AVX512BW-ONLY-SLOW-NEXT: retq @@ -18344,592 +18517,586 @@ ; AVX512BW-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b ; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -18941,8 +19108,8 @@ ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -18952,7 +19119,7 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -18971,388 +19138,404 @@ ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512BW-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512BW-ONLY-FAST-NEXT: vzeroupper ; AVX512BW-ONLY-FAST-NEXT: retq @@ -19362,592 +19545,586 @@ ; AVX512DQBW-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b ; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -19959,8 +20136,8 @@ ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -19970,7 +20147,7 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -19989,388 +20166,404 @@ ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512DQBW-SLOW-NEXT: vzeroupper ; AVX512DQBW-SLOW-NEXT: retq @@ -20380,592 +20573,586 @@ ; AVX512DQBW-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512DQBW-FAST-NEXT: movb $-64, %r11b ; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] -; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm11 = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] -; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] -; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm18[0],zmm30[0],zmm18[2],zmm30[2],zmm18[4],zmm30[4],zmm18[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm10, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm4, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] -; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 {%k1} = zmm18[1],zmm30[1],zmm18[3],zmm30[3],zmm18[5],zmm30[5],zmm18[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm25 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm25 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 = zmm26[0],zmm19[0],zmm26[2],zmm19[2],zmm26[4],zmm19[4],zmm26[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm26[1],zmm19[1],zmm26[3],zmm19[3],zmm26[5],zmm19[5],zmm26[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm20[0],zmm21[0],zmm20[2],zmm21[2],zmm20[4],zmm21[4],zmm20[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 {%k1} = zmm20[1],zmm21[1],zmm20[3],zmm21[3],zmm20[5],zmm21[5],zmm20[7],zmm21[7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm24[0],zmm17[0],zmm24[2],zmm17[2],zmm24[4],zmm17[4],zmm24[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm24[1],zmm17[1],zmm24[3],zmm17[3],zmm24[5],zmm17[5],zmm24[7],zmm17[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm8, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm23[0],zmm16[0],zmm23[2],zmm16[2],zmm23[4],zmm16[4],zmm23[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm16[1],zmm23[3],zmm16[3],zmm23[5],zmm16[5],zmm23[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm11[0],zmm0[2],zmm11[2],zmm0[4],zmm11[4],zmm0[6],zmm11[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm11[1],zmm17[3],zmm11[3],zmm17[5],zmm11[5],zmm17[7],zmm11[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm19[0],zmm20[0],zmm19[2],zmm20[2],zmm19[4],zmm20[4],zmm19[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm19[1],zmm20[1],zmm19[3],zmm20[3],zmm19[5],zmm20[5],zmm19[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm19[0],zmm0[2],zmm19[2],zmm0[4],zmm19[4],zmm0[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm6 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm14, %zmm8 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm10, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm21[0],zmm26[0],zmm21[2],zmm26[2],zmm21[4],zmm26[4],zmm21[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm25, %zmm2 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm21[1],zmm26[1],zmm21[3],zmm26[3],zmm21[5],zmm26[5],zmm21[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm7, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm9 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm7[0],zmm12[0],zmm7[2],zmm12[2],zmm7[4],zmm12[4],zmm7[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm8 = zmm7[1],zmm12[1],zmm7[3],zmm12[3],zmm7[5],zmm12[5],zmm7[7],zmm12[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm27[0],zmm17[0],zmm27[2],zmm17[2],zmm27[4],zmm17[4],zmm27[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm27[1],zmm17[1],zmm27[3],zmm17[3],zmm27[5],zmm17[5],zmm27[7],zmm17[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm10 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm2 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm5 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm15, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm15, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,1,9,1,9,1,9] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] ; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm1, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm4, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm23 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm25[0],zmm14[0],zmm25[2],zmm14[2],zmm25[4],zmm14[4],zmm25[6],zmm14[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm22 {%k1} = zmm25[1],zmm14[1],zmm25[3],zmm14[3],zmm25[5],zmm14[5],zmm25[7],zmm14[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm24[0],zmm0[0],zmm24[2],zmm0[2],zmm24[4],zmm0[4],zmm24[6],zmm0[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 = zmm24[1],zmm0[1],zmm24[3],zmm0[3],zmm24[5],zmm0[5],zmm24[7],zmm0[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k1} = zmm10[0],zmm9[0],zmm10[2],zmm9[2],zmm10[4],zmm9[4],zmm10[6],zmm9[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm28 {%k1} = zmm10[1],zmm9[1],zmm10[3],zmm9[3],zmm10[5],zmm9[5],zmm10[7],zmm9[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm21, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm21[0],zmm8[0],zmm21[2],zmm8[2],zmm21[4],zmm8[4],zmm21[6],zmm8[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm21[1],zmm8[1],zmm21[3],zmm8[3],zmm21[5],zmm8[5],zmm21[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm21 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] @@ -20977,8 +21164,8 @@ ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -20988,7 +21175,7 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm22, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -21007,388 +21194,404 @@ ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm12[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 ; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm15, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm13, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm22 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm26 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm8 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm7 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm11 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 ; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm7 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm13[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm12[1],ymm9[1],ymm12[3],ymm9[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 ; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm13[2,3],ymm9[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm2, %zmm25 ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm29 ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm11 = xmm10[0],xmm9[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm12[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm10[1],xmm9[1] ; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm9, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm10[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm11[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm9, %zmm9 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm10[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm15, %zmm10, %zmm10 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm13 = xmm12[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm15[0],xmm14[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm19, %zmm13, %zmm13 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm14, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm19 ; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm19[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm15, %zmm15 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm15[0,1,2,3],zmm1[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm19[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3],zmm30[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm14[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm19 ; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 ; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm31[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm30, %zmm19, %zmm19 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],zmm20[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm0, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm23[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm14[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm20[0,1,2,3],zmm16[4,5,6,7] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm12, %zmm12 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm17[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm23 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm23[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm31, %zmm20, %zmm20 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm20[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm30[1],xmm23[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm17[1],xmm14[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm20, %zmm14, %zmm14 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm14[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 4032(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3968(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3904(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3840(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3456(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3392(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2496(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm6, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3648(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 3584(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3072(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2560(%rax) ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) ; AVX512DQBW-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 ; AVX512DQBW-FAST-NEXT: vzeroupper ; AVX512DQBW-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -130,15 +130,15 @@ ; SSE-NEXT: movdqa (%rsi), %xmm2 ; SSE-NEXT: movdqa 16(%rsi), %xmm3 ; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm4, 16(%rdx) +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm2, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm4, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride2_vf32: @@ -216,25 +216,25 @@ ; SSE-NEXT: movdqa 32(%rsi), %xmm6 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: movdqa %xmm3, 96(%rdx) -; SSE-NEXT: movdqa %xmm6, 112(%rdx) -; SSE-NEXT: movdqa %xmm2, 64(%rdx) -; SSE-NEXT: movdqa %xmm5, 80(%rdx) -; SSE-NEXT: movdqa %xmm1, 32(%rdx) -; SSE-NEXT: movdqa %xmm4, 48(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm8, 16(%rdx) +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: movdqa %xmm3, 112(%rdx) +; SSE-NEXT: movdqa %xmm6, 96(%rdx) +; SSE-NEXT: movdqa %xmm2, 80(%rdx) +; SSE-NEXT: movdqa %xmm5, 64(%rdx) +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm4, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 16(%rdx) +; SSE-NEXT: movdqa %xmm8, (%rdx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride2_vf64: @@ -247,22 +247,22 @@ ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, 48(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm6, 112(%rdx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, 64(%rdx) -; AVX1-ONLY-NEXT: vmovdqa %xmm8, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 80(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 48(%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 16(%rdx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride2_vf64: @@ -273,16 +273,16 @@ ; AVX2-ONLY-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] ; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 96(%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 64(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rdx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -324,22 +324,22 @@ ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -147,28 +147,28 @@ ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,13,12,u,15,14,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,11,4,6,13,10,12,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[2,3],zero,xmm0[4,5],zero,xmm0[6,7],zero,xmm0[8,9],zero,xmm0[10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rcx) +; AVX1-ONLY-NEXT: vmovq %xmm3, 16(%rcx) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride3_vf8: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-ONLY-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-ONLY-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-ONLY-NEXT: vmovq %xmm1, 16(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rcx) @@ -179,13 +179,12 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vmovq %xmm1, 16(%rcx) ; AVX512F-NEXT: vmovdqa %xmm0, (%rcx) @@ -196,13 +195,12 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5,21],zero,ymm0[30,22],zero,ymm0[31,23],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpbroadcastq (%rdx), %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[21],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx) ; AVX512BW-NEXT: vmovdqa %xmm0, (%rcx) @@ -340,12 +338,12 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rsi), %xmm10 -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm10 +; SSE-NEXT: movdqa 16(%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] @@ -355,107 +353,107 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: pand %xmm6, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm9, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,5] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,5,5,6] ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: por %xmm9, %xmm11 -; SSE-NEXT: pand %xmm5, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm9, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: movdqa %xmm10, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm11 ; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm6, %xmm12 ; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pandn %xmm13, %xmm11 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm11, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; SSE-NEXT: pand %xmm8, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm8 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,6,6] -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm10, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rcx) -; SSE-NEXT: movdqa %xmm9, 32(%rcx) -; SSE-NEXT: movdqa %xmm7, 48(%rcx) -; SSE-NEXT: movdqa %xmm11, 80(%rcx) -; SSE-NEXT: movdqa %xmm6, 16(%rcx) -; SSE-NEXT: movdqa %xmm3, 64(%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: por %xmm6, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, 80(%rcx) +; SSE-NEXT: movdqa %xmm11, 48(%rcx) +; SSE-NEXT: movdqa %xmm7, 32(%rcx) +; SSE-NEXT: movdqa %xmm9, (%rcx) +; SSE-NEXT: movdqa %xmm5, 64(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf32: @@ -489,10 +487,10 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX1-ONLY-NEXT: retq @@ -590,252 +588,252 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 16(%rdi), %xmm10 +; SSE-NEXT: movdqa (%rdi), %xmm9 +; SSE-NEXT: movdqa 16(%rdi), %xmm13 +; SSE-NEXT: movdqa 32(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm12 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rsi), %xmm9 -; SSE-NEXT: movdqa 32(%rsi), %xmm15 -; SSE-NEXT: movdqa 48(%rsi), %xmm4 -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdx), %xmm11 +; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa 16(%rsi), %xmm15 +; SSE-NEXT: movdqa 32(%rsi), %xmm7 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa 16(%rdx), %xmm11 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdx), %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3] +; SSE-NEXT: movdqa 32(%rdx), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] +; SSE-NEXT: pand %xmm3, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm6, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm6[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: movdqa (%rsi), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: pandn %xmm14, %xmm10 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm3, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] -; SSE-NEXT: movdqa %xmm14, %xmm8 -; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm8, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: movdqa 48(%rsi), %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,5,5,6] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: movdqa 48(%rdx), %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm12[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm2, %xmm14 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm9, %xmm6 ; SSE-NEXT: pand %xmm1, %xmm4 ; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,6,6] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[2,3,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm12 +; SSE-NEXT: por %xmm9, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm12, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm13[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,7,7] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm7, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm13, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm15, %xmm13 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; SSE-NEXT: movdqa %xmm14, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,6,5,7,7] ; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; SSE-NEXT: pand %xmm1, %xmm15 +; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,2,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,6,6] +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[0,1,0,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm15[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; SSE-NEXT: pandn %xmm9, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,1,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm14, 32(%rcx) -; SSE-NEXT: movdqa %xmm7, 48(%rcx) +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, 176(%rcx) +; SSE-NEXT: movdqa %xmm15, 144(%rcx) +; SSE-NEXT: movdqa %xmm7, 128(%rcx) +; SSE-NEXT: movdqa %xmm13, 96(%rcx) ; SSE-NEXT: movdqa %xmm12, 80(%rcx) -; SSE-NEXT: movdqa %xmm5, 96(%rcx) -; SSE-NEXT: movdqa %xmm4, 128(%rcx) -; SSE-NEXT: movdqa %xmm6, 144(%rcx) -; SSE-NEXT: movdqa %xmm8, 176(%rcx) -; SSE-NEXT: movdqa %xmm10, 16(%rcx) -; SSE-NEXT: movdqa %xmm13, 64(%rcx) +; SSE-NEXT: movdqa %xmm5, 48(%rcx) +; SSE-NEXT: movdqa %xmm6, 32(%rcx) +; SSE-NEXT: movdqa %xmm14, (%rcx) +; SSE-NEXT: movdqa %xmm8, 160(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 112(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 160(%rcx) +; SSE-NEXT: movaps %xmm0, 64(%rcx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rcx) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride3_vf64: @@ -940,18 +938,18 @@ ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm15, 80(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm1, 160(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm7, 176(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm3, 128(%rcx) ; AVX1-ONLY-NEXT: vmovdqa %xmm11, 144(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm13, 96(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 64(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm15, 80(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 32(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 48(%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm14, (%rcx) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX1-ONLY-NEXT: popq %rax ; AVX1-ONLY-NEXT: retq ; @@ -1007,12 +1005,12 @@ ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 128(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 160(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 96(%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rcx) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1054,8 +1052,8 @@ ; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3] ; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX512F-NEXT: vmovdqa %ymm1, 160(%rcx) +; AVX512F-NEXT: vmovdqa %ymm3, 128(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 64(%rcx) ; AVX512F-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm5, 96(%rcx) @@ -1091,11 +1089,11 @@ ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 ; AVX512BW-NEXT: vpshufb %zmm4, %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512BW-NEXT: vpshufb %zmm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm5, %zmm3 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rcx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -69,38 +69,53 @@ ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,0,3,1,4,5,6,7] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%r8) +; SSE-NEXT: movdqa %xmm0, (%r8) ; SSE-NEXT: retq ; -; AVX-LABEL: store_i8_stride4_vf4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] -; AVX-NEXT: vmovdqa %xmm0, (%r8) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: store_i8_stride4_vf4: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: store_i8_stride4_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,12,9,13,10,14,11,15,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%r8) +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride4_vf4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpbroadcastd (%rdx), %xmm1 +; AVX512-NEXT: vpunpckldq (%rcx){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,12,9,13,10,14,11,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vmovdqa %xmm0, (%r8) +; AVX512-NEXT: retq %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 @@ -171,10 +186,10 @@ ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm1, 48(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm1, 32(%r8) ; SSE-NEXT: movdqa %xmm5, 16(%r8) ; SSE-NEXT: movdqa %xmm6, (%r8) ; SSE-NEXT: retq @@ -193,8 +208,8 @@ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-NEXT: vmovdqa %xmm4, 32(%r8) ; AVX1-NEXT: vmovdqa %xmm0, 48(%r8) +; AVX1-NEXT: vmovdqa %xmm4, 32(%r8) ; AVX1-NEXT: vmovdqa %xmm1, 16(%r8) ; AVX1-NEXT: vmovdqa %xmm3, (%r8) ; AVX1-NEXT: retq @@ -268,33 +283,33 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: movdqa %xmm4, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] ; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: movdqa %xmm1, 96(%r8) -; SSE-NEXT: movdqa %xmm6, 112(%r8) -; SSE-NEXT: movdqa %xmm8, 64(%r8) -; SSE-NEXT: movdqa %xmm10, 80(%r8) -; SSE-NEXT: movdqa %xmm0, 32(%r8) -; SSE-NEXT: movdqa %xmm5, 48(%r8) -; SSE-NEXT: movdqa %xmm2, (%r8) -; SSE-NEXT: movdqa %xmm3, 16(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm1, 112(%r8) +; SSE-NEXT: movdqa %xmm6, 96(%r8) +; SSE-NEXT: movdqa %xmm8, 80(%r8) +; SSE-NEXT: movdqa %xmm10, 64(%r8) +; SSE-NEXT: movdqa %xmm0, 48(%r8) +; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movdqa %xmm2, 16(%r8) +; SSE-NEXT: movdqa %xmm3, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride4_vf32: @@ -327,10 +342,10 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -441,74 +456,74 @@ ; SSE-NEXT: movdqa %xmm5, %xmm6 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; SSE-NEXT: movdqa %xmm13, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE-NEXT: movdqa %xmm11, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] ; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] ; SSE-NEXT: movdqa %xmm10, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] ; SSE-NEXT: movdqa %xmm13, %xmm14 -; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] ; SSE-NEXT: movdqa 48(%rdx), %xmm15 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] ; SSE-NEXT: movdqa 48(%rcx), %xmm12 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; SSE-NEXT: movdqa 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE-NEXT: movdqa %xmm2, 224(%r8) -; SSE-NEXT: movdqa %xmm1, 240(%r8) -; SSE-NEXT: movdqa %xmm3, 192(%r8) -; SSE-NEXT: movdqa %xmm0, 208(%r8) -; SSE-NEXT: movdqa %xmm4, 160(%r8) -; SSE-NEXT: movdqa %xmm9, 176(%r8) -; SSE-NEXT: movdqa %xmm13, 128(%r8) -; SSE-NEXT: movdqa %xmm14, 144(%r8) -; SSE-NEXT: movdqa %xmm11, 96(%r8) -; SSE-NEXT: movdqa %xmm8, 112(%r8) -; SSE-NEXT: movdqa %xmm7, 64(%r8) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; SSE-NEXT: movdqa %xmm2, 240(%r8) +; SSE-NEXT: movdqa %xmm1, 224(%r8) +; SSE-NEXT: movdqa %xmm3, 208(%r8) +; SSE-NEXT: movdqa %xmm0, 192(%r8) +; SSE-NEXT: movdqa %xmm4, 176(%r8) +; SSE-NEXT: movdqa %xmm9, 160(%r8) +; SSE-NEXT: movdqa %xmm13, 144(%r8) +; SSE-NEXT: movdqa %xmm14, 128(%r8) +; SSE-NEXT: movdqa %xmm11, 112(%r8) +; SSE-NEXT: movdqa %xmm8, 96(%r8) +; SSE-NEXT: movdqa %xmm7, 80(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%r8) -; SSE-NEXT: movdqa %xmm5, 32(%r8) +; SSE-NEXT: movaps %xmm0, 64(%r8) +; SSE-NEXT: movdqa %xmm5, 48(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%r8) -; SSE-NEXT: movdqa %xmm6, (%r8) +; SSE-NEXT: movaps %xmm0, 32(%r8) +; SSE-NEXT: movdqa %xmm6, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%r8) +; SSE-NEXT: movaps %xmm0, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride4_vf64: @@ -573,14 +588,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm2, 160(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 224(%r8) -; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm7, 128(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 96(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 64(%r8) ; AVX1-ONLY-NEXT: vmovaps %ymm5, 32(%r8) +; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -618,14 +633,14 @@ ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 224(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm3, 192(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 64(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm7, 128(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r8) +; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -663,14 +678,14 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX512F-NEXT: vmovdqa %ymm1, 224(%r8) -; AVX512F-NEXT: vmovdqa %ymm4, 64(%r8) +; AVX512F-NEXT: vmovdqa %ymm5, 192(%r8) ; AVX512F-NEXT: vmovdqa %ymm0, 96(%r8) -; AVX512F-NEXT: vmovdqa %ymm7, 128(%r8) +; AVX512F-NEXT: vmovdqa %ymm4, 64(%r8) ; AVX512F-NEXT: vmovdqa %ymm9, 160(%r8) -; AVX512F-NEXT: vmovdqa %ymm2, (%r8) +; AVX512F-NEXT: vmovdqa %ymm7, 128(%r8) ; AVX512F-NEXT: vmovdqa %ymm3, 32(%r8) +; AVX512F-NEXT: vmovdqa %ymm2, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -726,7 +741,6 @@ ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -20,8 +20,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -46,8 +46,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,2,6,10,14,3,u,u,u,u,u,u] @@ -75,8 +75,8 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: pxor %xmm3, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1] @@ -119,28 +119,30 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf4: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,4,8,12],zero,xmm1[1,5,9,13],zero,xmm1[2,6,10,14],zero,xmm1[3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vmovd %xmm0, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride5_vf4: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -154,11 +156,13 @@ ; ; AVX512F-LABEL: store_i8_stride5_vf4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -172,11 +176,13 @@ ; ; AVX512BW-LABEL: store_i8_stride5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,ymm0[1,5,9,13],zero,ymm0[2,6,10,14],zero,ymm0[3],zero,zero,zero,ymm0[19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -288,128 +294,137 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf8: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u],zero,zero,xmm1[1,9,u],zero,zero,xmm1[2,10,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,13,6,8,10,12,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,8,u],zero,zero,xmm3[1,9,u],zero,zero,xmm3[2,10,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,1,9],zero,zero,xmm0[u,2,10],zero,zero,xmm0[u,3] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[3,11,u],zero,zero,xmm3[4,12,u],zero,zero,xmm3[5,13,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm2[3],zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,xmm2[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,zero,xmm1[4],zero,zero,zero,zero,xmm1[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm4, (%r9) -; AVX1-ONLY-NEXT: vmovq %xmm3, 32(%r9) +; AVX1-ONLY-NEXT: vmovq %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: movq (%r8), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: movq (%r8), %rax +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vmovq %rax, %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: shrq $48, %rax -; AVX2-SLOW-NEXT: vmovd %eax, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovd %eax, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-SLOW-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: movq (%r8), %rax ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovq %rax, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: movq (%r8), %rax +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-NEXT: vmovq %rax, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-FAST-NEXT: shrq $48, %rax -; AVX2-FAST-NEXT: vmovd %eax, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: vmovd %eax, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,1] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: movq (%r8), %rax ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq %rax, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: movq (%r8), %rax +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vmovq %rax, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,14,u],zero,zero,xmm4[7,15,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax -; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm5, %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8],zero,zero,zero,ymm1[1,9],zero,zero,zero,ymm1[2,10],zero,zero,zero,ymm1[3],zero,ymm1[19,27],zero,zero,zero,ymm1[20,28],zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm3, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -422,24 +437,25 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %rax, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: shrq $48, %rax -; AVX512F-SLOW-NEXT: vmovd %eax, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-SLOW-NEXT: vmovd %eax, %xmm3 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,1,9],zero,zero,ymm0[u,2,10],zero,zero,ymm0[u,3],zero,ymm0[19,27,u],zero,zero,ymm0[20,28,u],zero,zero,ymm0[21,29,u],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u],zero,zero,ymm0[1,9,u],zero,zero,ymm0[2,10,u],zero,ymm0[27],zero,zero,ymm0[u,20,28],zero,zero,ymm0[u,21,29],zero,zero,ymm0[u,22,30] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovq %xmm3, 32(%r9) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -453,24 +469,25 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovq %rax, %xmm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30] -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovq %rax, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512F-FAST-NEXT: shrq $48, %rax -; AVX512F-FAST-NEXT: vmovd %eax, %xmm1 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovq %xmm1, 32(%r9) +; AVX512F-FAST-NEXT: vmovd %eax, %xmm3 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,1,9],zero,zero,ymm0[u,2,10],zero,zero,ymm0[u,3],zero,ymm0[19,27,u],zero,zero,ymm0[20,28,u],zero,zero,ymm0[21,29,u],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8,u],zero,zero,ymm0[1,9,u],zero,zero,ymm0[2,10,u],zero,ymm0[27],zero,zero,ymm0[u,20,28],zero,zero,ymm0[u,21,29],zero,zero,ymm0[u,22,30] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovq %xmm3, 32(%r9) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq @@ -484,28 +501,29 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq %rax, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512BW-SLOW-NEXT: movl $554189328, %ecx # imm = 0x21084210 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovq %rax, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-SLOW-NEXT: shrq $48, %rax -; AVX512BW-SLOW-NEXT: vpbroadcastw %eax, %xmm1 +; AVX512BW-SLOW-NEXT: vpbroadcastw %eax, %xmm3 ; AVX512BW-SLOW-NEXT: movw $132, %ax ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,zero,ymm0[3],zero,ymm0[19,27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512BW-SLOW-NEXT: movl $554189328, %eax # imm = 0x21084210 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovq %xmm2, 32(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -518,28 +536,29 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512BW-FAST-NEXT: vmovq %rax, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30] -; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX512BW-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512BW-FAST-NEXT: movl $554189328, %ecx # imm = 0x21084210 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovq %rax, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,14,u],zero,zero,xmm3[7,15,u,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FAST-NEXT: shrq $48, %rax -; AVX512BW-FAST-NEXT: vpbroadcastw %eax, %xmm1 +; AVX512BW-FAST-NEXT: vpbroadcastw %eax, %xmm3 ; AVX512BW-FAST-NEXT: movw $132, %ax ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-FAST-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,zero,ymm0[3],zero,ymm0[19,27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,ymm0[1,9],zero,zero,zero,ymm0[2,10],zero,zero,ymm0[27],zero,zero,zero,ymm0[20,28],zero,zero,zero,ymm0[21,29],zero,zero,zero,ymm0[22,30] +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1] +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FAST-NEXT: movl $554189328, %eax # imm = 0x21084210 +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovq %xmm2, 32(%r9) +; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -560,212 +579,208 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa (%rsi), %xmm11 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa (%rcx), %xmm8 -; SSE-NEXT: movdqa (%r8), %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa (%rsi), %xmm3 +; SSE-NEXT: movdqa (%rdx), %xmm5 +; SSE-NEXT: movdqa (%rcx), %xmm4 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,1,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm6, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,1,0] -; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: por %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: por %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm11, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,1,2,1] +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm11, %xmm1 +; SSE-NEXT: por %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm12, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pand %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm15 +; SSE-NEXT: por %xmm10, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,0,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pandn %xmm13, %xmm7 +; SSE-NEXT: por %xmm15, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm3[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm12, %xmm14 +; SSE-NEXT: movdqa %xmm10, %xmm14 ; SSE-NEXT: pandn %xmm13, %xmm14 -; SSE-NEXT: por %xmm11, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm11 -; SSE-NEXT: pandn %xmm14, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[2,2,3,3] +; SSE-NEXT: pand %xmm10, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm14, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,2] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm15, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm12, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,3] -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: por %xmm6, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm13 -; SSE-NEXT: por %xmm13, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,3,2] -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm4[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm5, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: pandn %xmm5, %xmm13 +; SSE-NEXT: pand %xmm8, %xmm14 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,1] +; SSE-NEXT: pandn %xmm11, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,1,3] +; SSE-NEXT: pandn %xmm11, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: por %xmm8, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,3,2] +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,2] -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm9, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3] -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm0, 64(%r9) -; SSE-NEXT: movdqa %xmm6, (%r9) -; SSE-NEXT: movdqa %xmm15, 16(%r9) -; SSE-NEXT: movdqa %xmm11, 48(%r9) +; SSE-NEXT: movdqa %xmm8, (%r9) +; SSE-NEXT: movdqa %xmm13, 48(%r9) +; SSE-NEXT: movdqa %xmm7, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm2[6,u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero,xmm4[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255> ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,xmm3[9,8],zero,zero,zero,xmm3[11,10],zero,zero,zero,xmm3[13,12] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%r9) -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 64(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: retq ; @@ -947,13 +962,13 @@ ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7] @@ -967,18 +982,18 @@ ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u] -; AVX512F-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u] +; AVX512F-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper @@ -1049,7 +1064,7 @@ ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zmm5[1,9],zero,zero,zero,zmm5[2,10],zero,zero,zero,zmm5[19,27],zero,zero,zero,zmm5[20,28],zero,zero,zero,zmm5[21,29],zero,zero,zero,zero,zero,zero,zmm5[35,39],zero,zero,zero,zmm5[40,44],zero,zero,zero,zmm5[41,45],zero,zero,zero,zmm5[50,54],zero,zero,zero,zmm5[51,55],zero,zero,zero,zmm5[56,60],zero,zero ; AVX512BW-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm6 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] ; AVX512BW-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 @@ -1083,501 +1098,519 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride5_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $152, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa (%rsi), %xmm10 -; SSE-NEXT: movdqa 16(%rsi), %xmm12 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm1 -; SSE-NEXT: movdqa (%rcx), %xmm15 +; SSE-NEXT: subq $120, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa 16(%rsi), %xmm15 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa 16(%r8), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm1 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rcx), %xmm10 +; SSE-NEXT: movdqa 16(%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm13 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,0] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,2,1] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,0,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm15 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa 16(%r8), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm7[8],xmm14[9],xmm7[9],xmm14[10],xmm7[10],xmm14[11],xmm7[11],xmm14[12],xmm7[12],xmm14[13],xmm7[13],xmm14[14],xmm7[14],xmm14[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,1,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,4,7] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,3,3] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,5,6,6,7] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: pand %xmm3, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,5,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[0,1,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,4] -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm7, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,4] +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm8, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[2,2,2,2] +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm5 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[3,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm11, %xmm15 ; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] ; SSE-NEXT: movdqa %xmm11, %xmm15 ; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,4] ; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm4, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,6] -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm15, %xmm1 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,1,1] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,2] -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm15 -; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshuflw $164, (%rsp), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm15 -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,2] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,7,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pand %xmm6, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,3,3,3] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[0,1,2,3,7,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,3,2,2] -; SSE-NEXT: pandn %xmm12, %xmm11 -; SSE-NEXT: por %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm11, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm5, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm2 -; SSE-NEXT: pshuflw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm15[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,1,3] +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm13, %xmm4 +; SSE-NEXT: pand %xmm11, %xmm14 +; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,3,2] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm10 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: # xmm13 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,3,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3] +; SSE-NEXT: pand %xmm7, %xmm13 +; SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: pand %xmm12, %xmm14 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm15, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm9[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: pshuflw $225, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[1,0,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,1] +; SSE-NEXT: pandn %xmm12, %xmm2 +; SSE-NEXT: por %xmm14, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pshuflw $164, (%rsp), %xmm5 # 16-byte Folded Reload ; SSE-NEXT: # xmm5 = mem[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,3] -; SSE-NEXT: pandn %xmm5, %xmm13 -; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pand %xmm8, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: por %xmm13, %xmm8 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm3, (%r9) +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm5, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,3,2] +; SSE-NEXT: pandn %xmm5, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufhw $167, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; SSE-NEXT: # xmm5 = mem[0,1,2,3,7,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,2] +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: por %xmm7, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm13, 144(%r9) +; SSE-NEXT: movdqa %xmm2, 80(%r9) ; SSE-NEXT: movdqa %xmm10, 64(%r9) -; SSE-NEXT: movdqa %xmm0, 80(%r9) -; SSE-NEXT: movdqa %xmm14, 144(%r9) -; SSE-NEXT: movdqa %xmm7, 16(%r9) +; SSE-NEXT: movdqa %xmm4, (%r9) +; SSE-NEXT: movdqa %xmm3, 128(%r9) +; SSE-NEXT: movdqa %xmm8, 96(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 48(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 96(%r9) +; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%r9) +; SSE-NEXT: movaps %xmm0, 112(%r9) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 32(%r9) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 112(%r9) -; SSE-NEXT: addq $152, %rsp +; SSE-NEXT: addq $120, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: subq $24, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7],zero,xmm4[9,10,11,12],zero,xmm4[14,15] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm9[3],zero,zero,zero,zero,xmm9[4],zero,zero,zero,zero,xmm9[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm1[9],zero,zero,zero,zero,xmm1[10],zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3],xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm10 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3],zero,xmm10[5,6,7,8],zero,xmm10[10,11,12,13],zero,xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm12 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6],zero,xmm5[u,u,u,7],zero,xmm5[u,u,u,8],zero,xmm5[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm9, %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3],xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm14, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[5,6,7,8],zero,xmm1[10,11,12,13],zero,xmm1[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,xmm14[2],zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm11[8],xmm15[9],xmm11[9],xmm15[10],xmm11[10],xmm15[11],xmm11[11],xmm15[12],xmm11[12],xmm15[13],xmm11[13],xmm15[14],xmm11[14],xmm15[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] +; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm15[6,u,u,u],zero,xmm15[7,u,u,u],zero,xmm15[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[6],zero,xmm8[u,u,u,7],zero,xmm8[u,u,u,8],zero,xmm8[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm14[9],zero,zero,zero,zero,xmm14[10],zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm6 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%r9) +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u,u],zero,xmm10[7,u,u,u],zero,xmm10[8,u,u,u],zero,xmm10[9,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm13[3],zero,zero,zero,zero,xmm13[4],zero,zero,zero,zero,xmm13[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 112(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 32(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) +; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) +; AVX1-ONLY-NEXT: addq $24, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -1671,8 +1704,8 @@ ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm8, 96(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 128(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -1696,10 +1729,11 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,0,0,1,1] +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero ; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 @@ -1711,40 +1745,40 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,6,6,6,7,7,7,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,6,5,5,5,5,4,6] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 @@ -1761,8 +1795,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-NEXT: vzeroupper @@ -1806,37 +1840,37 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm3 @@ -1853,8 +1887,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1862,169 +1896,157 @@ ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf32: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11] -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12] +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero ; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero -; AVX512F-SLOW-NEXT: vpor %xmm9, %xmm11, %xmm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm6, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9,u,11,u],zero,xmm8[10],zero,xmm8[12,u],zero +; AVX512F-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm8, %ymm7, %ymm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u] -; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u,u,19] -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm5, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm3[21],zero,ymm3[21,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm1[21,u],zero,ymm1[20],zero,ymm1[22],zero,ymm1[24,u],zero,ymm1[23],zero,ymm1[25,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[13,u,u],zero,zero,ymm1[14,u,u],zero,zero,ymm1[15,u,u],zero,zero,ymm1[16,u,u],zero,zero,ymm1[17,u,u],zero,zero,ymm1[18,u,u],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[12,13],zero,ymm2[u,u,13,14],zero,ymm2[u,u,14,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,16,17],zero,ymm2[u,u,17,18],zero,ymm2[u,u,18,19] +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm10, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm1[26],zero,ymm1[28,u],zero,ymm1[u],zero,ymm1[29],zero,ymm1[31,u],zero,ymm1[30] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf32: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero -; AVX512F-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero -; AVX512F-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm6, %ymm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero +; AVX512F-FAST-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm10, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[u],zero,xmm9[u,10],zero,xmm9[12],zero,xmm9[u,11] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9,u,11,u],zero,xmm8[10],zero,xmm8[12,u],zero +; AVX512F-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm6 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = -; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u] -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u,u],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u,u,19] -; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %ymm5, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm2[21],zero,ymm2[21,20],zero,ymm2[22],zero,ymm2[24],zero,ymm2[22,23],zero,ymm2[25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm1[21,u],zero,ymm1[20],zero,ymm1[22],zero,ymm1[24,u],zero,ymm1[23],zero,ymm1[25,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[13,u,u],zero,zero,ymm1[14,u,u],zero,zero,ymm1[15,u,u],zero,zero,ymm1[16,u,u],zero,zero,ymm1[17,u,u],zero,zero,ymm1[18,u,u],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[12,13],zero,ymm2[u,u,13,14],zero,ymm2[u,u,14,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,16,17],zero,ymm2[u,u,17,18],zero,ymm2[u,u,18,19] +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm4[21],zero,ymm4[21,20],zero,ymm4[22],zero,ymm4[24],zero,ymm4[22,23],zero,ymm4[25] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,13],zero,ymm4[u,u,u,14],zero,ymm4[u,u,u,15],zero,ymm4[u,u,u,16],zero,ymm4[u,u,u,17],zero,ymm4[u,u,u,18],zero,ymm4[u,u] +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm8, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero,zero +; AVX512F-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm10, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm1[26],zero,ymm1[28,u],zero,ymm1[u],zero,ymm1[29],zero,ymm1[31,u],zero,ymm1[30] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[26],zero,ymm0[28],zero,zero,ymm0[27],zero,ymm0[29],zero,ymm0[31],zero,zero,ymm0[30],zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <6,6,6,u,7,7,7,7> -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512F-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <6,6,6,u,7,7,7,7> +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, (%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -2127,11 +2149,11 @@ ; AVX512BW-FAST-LABEL: store_i8_stride5_vf32: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm4 ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11] +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8],zero,xmm3[u,7],zero,xmm3[9],zero,xmm3[u],zero,xmm3[u,10],zero,xmm3[12],zero,xmm3[u,11] ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero ; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 @@ -2144,53 +2166,54 @@ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero ; AVX512BW-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5] ; AVX512BW-FAST-NEXT: movabsq $3570337559743967628, %rax # imm = 0x318C631818C6318C ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6 +; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero ; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> -; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,3,3,u,4,4,4,4> +; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm7 ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> -; AVX512BW-FAST-NEXT: vpermd %zmm6, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm7, %zmm7 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero @@ -2205,8 +2228,8 @@ ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %ymm0, 128(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -2925,560 +2948,548 @@ ; AVX1-ONLY-LABEL: store_i8_stride5_vf64: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: subq $104, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm0[6,u,u,u],zero,xmm0[7,u,u,u],zero,xmm0[8,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0] +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[3,4,5,6],zero,xmm3[8,9,10,11],zero,xmm3[13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm9[6],zero,zero,zero,zero,xmm9[7],zero,zero,zero,zero,xmm9[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm9[3],zero,zero,zero,zero,xmm9[4],zero,zero,zero,zero,xmm9[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] -; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[12],zero,zero,zero,zero,xmm9[13],zero,zero,zero,zero,xmm9[14],zero,zero,zero,zero,xmm9[15] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1],zero,xmm8[3,4,5,6],zero,xmm8[8,9,10,11],zero,xmm8[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm3[6],zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,xmm3[8],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [3,0,0,1,4,5,0,2,3,0,0,1,4,5,0,2] -; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm15 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6] ; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[0,1,2],zero,xmm2[4,5,6,7],zero,xmm2[9,10,11,12],zero,xmm2[14,15] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[3],zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,xmm2[5],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm14 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero,xmm4[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,7],zero,xmm0[u,u,u,8],zero,xmm0[u,u,u,9],zero,xmm0[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2,3,4,5],zero,xmm4[7,8,9,10],zero,xmm4[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm2[9],zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,4,5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14,15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm10 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm12, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm6[0],zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,xmm6[2],zero +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[1,2,3,4],zero,xmm8[6,7,8,9],zero,xmm8[11,12,13,14],zero +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,7],zero,xmm0[u,u,u,8],zero,xmm0[u,u,u,9],zero,xmm0[u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] +; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1],zero,xmm4[3,4,5,6],zero,xmm4[8,9,10,11],zero,xmm4[13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [7,0,4,5,8,9,0,6,7,0,4,5,8,9,0,6] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7],zero,xmm1[9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[3],zero,zero,zero,zero,xmm3[4],zero,zero,zero,zero,xmm3[5],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,128,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,9,128,128,128,128,10,128,128,128,128,11,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3],zero,xmm5[5,6,7,8],zero,xmm5[10,11,12,13],zero,xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,xmm1[2],zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,10,11,u,u,u,12,13,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm14, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[12],zero,zero,zero,zero,xmm11[13],zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,xmm11[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,6,7,10,11,0,8,9,0,6,7,10,11,0,8,9] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,xmm8[7,u,u,u],zero,xmm8[8,u,u,u],zero,xmm8[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [6,11,10,0,9,8,13,12,6,11,10,0,9,8,13,12] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero,xmm4[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128] +; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [6,128,8,128,0,7,128,9,6,128,8,128,0,7,128,9] +; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,128,3,4,5,6,128,8,9,10,11,128,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,6,128,128,128,128,7,128,128,128,128,8,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,128,4,5,6,7,128,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,3,128,128,128,128,4,128,128,128,128,5,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[6],zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,0,1,u,u,u,2,3,u,u,u,4,5,u,u] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [12,13,0,10,11,14,15,0,12,13,0,10,11,14,15,0] +; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm7, %ymm10 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,1,2,3],zero,xmm0[5,6,7,8],zero,xmm0[10,11,12,13],zero,xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,1,2,3,4,128,6,7,8,9,128,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm10, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm15, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm11, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm10 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2],zero,xmm6[4,5,6,7],zero,xmm6[9,10,11,12],zero,xmm6[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,2,3,u,u,u,4,5,u,u,u,6] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[5,6,7,8],zero,xmm2[10,11,12,13],zero,xmm2[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[0],zero,zero,zero,zero,xmm11[1],zero,zero,zero,zero,xmm11[2],zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,7],zero,xmm6[u,u,u,8],zero,xmm6[u,u,u,9],zero,xmm6[u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[6,u,u,u],zero,xmm9[7,u,u,u],zero,xmm9[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[6],zero,xmm7[u,u,u,7],zero,xmm7[u,u,u,8],zero,xmm7[u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,5,4,u,u,u,7,6,u,u,u,9,8] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5],zero,xmm5[7,8,9,10],zero,xmm5[12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm11[9],zero,zero,zero,zero,xmm11[10],zero,zero,zero,zero,xmm11[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 96(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm13, 112(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5],zero,xmm2[7,8,9,10],zero,xmm2[12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 304(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm6, 256(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 272(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 224(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 240(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%r9) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%r9) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%r9) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%r9) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%r9) ; AVX1-ONLY-NEXT: addq $104, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride5_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $184, %rsp -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX2-SLOW-NEXT: subq $216, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm11, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm7 ; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm13 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,3,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,2,1,1,4,6,5,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,1,1,4,6,5,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,3,2] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm9, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm15, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> -; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,3,3,3,u,4,4,4> +; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm6 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3],xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd $80, (%rsp), %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = mem[0,0,1,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0,1,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = mem[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] ; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 288(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 256(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 288(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm2, 192(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 128(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: addq $184, %rsp +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: addq $216, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride5_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $168, %rsp -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX2-FAST-NEXT: subq $232, %rsp +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 @@ -3491,158 +3502,163 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,2,2,2,2,2,2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,11,128,13,10,128,12,128,14,128,12,13,128,15,128,13,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [9,128,11,128,128,10,128,12,128,14,128,128,13,128,15,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,13,10,128,12,128,10,11,12,13,128,15,128,13,14,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm13, %ymm15, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [11,128,128,10,128,12,128,128,128,128,13,128,15,128,128,14,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm13, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm8, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm7, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm8, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] @@ -3653,51 +3669,55 @@ ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 224(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%r9) +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 288(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: addq $168, %rsp +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-NEXT: addq $232, %rsp ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride5_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 @@ -3708,1097 +3728,878 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,5,128,3,4,128,6,128,8,128,6,7,128,9,128,7,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm10, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,2,3,3,6,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm15, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[0,2,1,1,4,6,5,5] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,11,128,13,10,128,12,128,14,128,12,13,128,15,128,13,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [9,128,11,128,128,10,128,12,128,14,128,128,13,128,15,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,10,128,12,128,10,11,12,13,128,15,128,13,14,128,128,29,26,128,28,128,26,27,28,29,128,31,128,29,30,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [11,128,128,10,128,12,128,128,128,128,13,128,15,128,128,14,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm9, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,1,1,4,6,5,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,3,3,6,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <3,3,3,3,u,4,4,4> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm14, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm8, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw (%rsp), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufd $80, (%rsp), %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 288(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 128(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 256(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 160(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: addq $200, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride5_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm5, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm7, %ymm10 -; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vporq %ymm6, %ymm7, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,ymm4[27],zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm9 +; AVX512F-SLOW-NEXT: vporq %xmm8, %xmm9, %xmm20 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm5[10],zero,ymm5[12],zero,zero,ymm5[11],zero,ymm5[13],zero,ymm5[15],zero,zero,ymm5[14],zero,zero,zero,ymm5[26],zero,ymm5[28],zero,zero,ymm5[27],zero,ymm5[29],zero,ymm5[31],zero,zero,ymm5[30],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[5],zero,zero,ymm5[4],zero,ymm5[6],zero,ymm5[8],zero,zero,ymm5[7],zero,ymm5[9],zero,zero,zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm21 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[11,u],zero,ymm4[10],zero,ymm4[12,u],zero,ymm4[u],zero,ymm4[13],zero,ymm4[15,u],zero,ymm4[14,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[3],zero,ymm4[5,u],zero,ymm4[4],zero,ymm4[6],zero,ymm4[8,u],zero,ymm4[7],zero,ymm4[9,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm9, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm13 -; AVX512F-SLOW-NEXT: vporq %xmm1, %xmm13, %xmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-SLOW-NEXT: vporq %xmm13, %xmm5, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm29 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm29[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm26, %ymm27, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm26 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm4[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,3,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm0[11],zero,zero,ymm0[10],zero,ymm0[12],zero,ymm0[14],zero,zero,ymm0[13],zero,ymm0[15],zero,zero,zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm13[3],zero,ymm13[5],zero,zero,ymm13[4],zero,ymm13[6],zero,ymm13[8],zero,zero,ymm13[7],zero,zero,zero,ymm13[19],zero,ymm13[21],zero,zero,ymm13[20],zero,ymm13[22],zero,ymm13[24],zero,zero,ymm13[23],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm24 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <9,128,11,u,128,10,128,12,128,14,u,128,13,128,15,u,25,128,27,u,128,26,128,28,128,30,u,128,29,128,31,u> +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm10, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm6, %ymm14 +; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm14, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm15, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX512F-SLOW-NEXT: vporq %xmm15, %xmm11, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 32(%r8), %xmm28 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm28[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm11, %ymm15, %ymm11 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm15[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,3,3,2] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm27, %ymm30, %ymm27 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm13, %zmm27 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm7 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm14[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[0,2,1,1,4,6,5,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vpandnq %ymm29, %ymm30, %ymm29 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm4 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm4, %zmm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm29 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm26, %zmm3, %zmm26 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm5, %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[11],zero,zero,ymm5[10],zero,ymm5[12],zero,ymm5[14],zero,zero,ymm5[13],zero,ymm5[15],zero,zero,zero,ymm5[27],zero,zero,ymm5[26],zero,ymm5[28],zero,ymm5[30],zero,zero,ymm5[29],zero,ymm5[31],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm13[3],zero,ymm13[5],zero,zero,ymm13[4],zero,ymm13[6],zero,ymm13[8],zero,zero,ymm13[7],zero,zero,zero,ymm13[19],zero,ymm13[21],zero,zero,ymm13[20],zero,ymm13[22],zero,ymm13[24],zero,zero,ymm13[23],zero +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,1,1,4,6,5,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm6[5],zero,zero,ymm6[4],zero,ymm6[6],zero,ymm6[8],zero,zero,ymm6[7],zero,ymm6[9],zero,zero,zero,ymm6[21],zero,zero,ymm6[20],zero,ymm6[22],zero,ymm6[24],zero,zero,ymm6[23],zero,ymm6[25],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[3],zero,ymm10[5,u],zero,ymm10[4],zero,ymm10[6],zero,ymm10[8,u],zero,ymm10[7],zero,ymm10[9,u,19],zero,ymm10[21,u],zero,ymm10[20],zero,ymm10[22],zero,ymm10[24,u],zero,ymm10[23],zero,ymm10[25,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[10],zero,ymm6[12],zero,zero,ymm6[11],zero,ymm6[13],zero,ymm6[15],zero,zero,ymm6[14],zero,zero,zero,ymm6[26],zero,ymm6[28],zero,zero,ymm6[27],zero,ymm6[29],zero,ymm6[31],zero,zero,ymm6[30],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm20[0,0,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm3 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm14, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm24, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm16, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm12, %ymm2, %ymm6 -; AVX512F-SLOW-NEXT: vpandq %ymm12, %ymm31, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm30, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm2, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm26 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm9[2,3,3,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,2,3,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm27 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,2,3,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm29[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[11,u],zero,ymm10[10],zero,ymm10[12,u],zero,ymm10[u],zero,ymm10[13],zero,ymm10[15,u],zero,ymm10[14,27,u],zero,ymm10[26],zero,ymm10[28,u],zero,ymm10[u],zero,ymm10[29],zero,ymm10[31,u],zero,ymm10[30] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm17[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm20[0,0,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm15, %zmm17, %zmm16 +; AVX512F-SLOW-NEXT: vpor %ymm3, %ymm8, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm22, %zmm3 +; AVX512F-SLOW-NEXT: vpor %ymm12, %ymm13, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm12, %zmm3 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm19[2,2,3,3,6,6,7,7] +; AVX512F-SLOW-NEXT: vporq %zmm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpor %ymm6, %ymm10, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm13, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,3,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm27[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm25[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[2,2,3,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm28[0,0,1,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 64(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 256(%r9) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 192(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 256(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 192(%r9) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride5_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm9 -; AVX512F-FAST-NEXT: vpor %ymm8, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm1 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm1, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vporq %ymm2, %ymm3, %ymm18 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm27 -; AVX512F-FAST-NEXT: vporq %xmm12, %xmm13, %xmm20 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm23 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,ymm4[26],zero,ymm4[28],zero,ymm4[30],zero,zero,ymm4[29],zero,ymm4[31],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm18 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vporq %ymm13, %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm10 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm10, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX512F-FAST-NEXT: vporq %xmm1, %xmm15, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vporq %xmm15, %xmm5, %xmm26 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,1,2,2,2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm15, %ymm27, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm30 -; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm31 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <4,u,5,5,5,5,u,6,30,30,30,u,31,31,31,31> -; AVX512F-FAST-NEXT: vpermi2d %zmm31, %zmm5, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = <4,u,5,5,5,5,u,6> -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm28, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm29, %ymm28 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[3,u,5,u,u,4,u,6,u,8,u,u,7,u,9,u,19,u,21,u,u,20,u,22,u,24,u,u,23,u,25,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[21],zero,zero,ymm11[20],zero,ymm11[22],zero,ymm11[24],zero,zero,ymm11[23],zero,ymm11[25],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm11[26],zero,ymm11[28],zero,zero,ymm11[27],zero,ymm11[29],zero,ymm11[31],zero,zero,ymm11[30],zero -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm28 +; AVX512F-FAST-NEXT: vporq %xmm2, %xmm3, %xmm20 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm5[10],zero,ymm5[12],zero,zero,ymm5[11],zero,ymm5[13],zero,ymm5[15],zero,zero,ymm5[14],zero,zero,zero,ymm5[26],zero,ymm5[28],zero,zero,ymm5[27],zero,ymm5[29],zero,ymm5[31],zero,zero,ymm5[30],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm5[5],zero,zero,ymm5[4],zero,ymm5[6],zero,ymm5[8],zero,zero,ymm5[7],zero,ymm5[9],zero,zero,zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm21 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[11,u],zero,ymm4[10],zero,ymm4[12,u],zero,ymm4[u],zero,ymm4[13],zero,ymm4[15,u],zero,ymm4[14,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[3],zero,ymm4[5,u],zero,ymm4[4],zero,ymm4[6],zero,ymm4[8,u],zero,ymm4[7],zero,ymm4[9,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm23 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm0[11],zero,zero,ymm0[10],zero,ymm0[12],zero,ymm0[14],zero,zero,ymm0[13],zero,ymm0[15],zero,zero,zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm14[3],zero,ymm14[5],zero,zero,ymm14[4],zero,ymm14[6],zero,ymm14[8],zero,zero,ymm14[7],zero,zero,zero,ymm14[19],zero,ymm14[21],zero,zero,ymm14[20],zero,ymm14[22],zero,ymm14[24],zero,zero,ymm14[23],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[9],zero,ymm14[11,u],zero,ymm14[10],zero,ymm14[12],zero,ymm14[14,u],zero,ymm14[13],zero,ymm14[15,u,25],zero,ymm14[27,u],zero,ymm14[26],zero,ymm14[28],zero,ymm14[30,u],zero,ymm14[29],zero,ymm14[31,u] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 +; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vporq %xmm0, %xmm15, %xmm26 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm13 +; AVX512F-FAST-NEXT: vporq %xmm1, %xmm13, %xmm29 +; AVX512F-FAST-NEXT: vmovdqa64 32(%r8), %xmm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,2,2,2,2,2,2] +; AVX512F-FAST-NEXT: vpermd %ymm27, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm1, %ymm28, %ymm1 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,u,5,5,5,5,u,6,14,14,14,u,15,15,15,15> +; AVX512F-FAST-NEXT: vpermd %zmm13, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [4,6,5,5,5,5,4,6] +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm28, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm30, %ymm28 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm2, %zmm26 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm29, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[11],zero,zero,ymm6[10],zero,ymm6[12],zero,ymm6[14],zero,zero,ymm6[13],zero,ymm6[15],zero,zero,zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm14[3],zero,ymm14[5],zero,zero,ymm14[4],zero,ymm14[6],zero,ymm14[8],zero,zero,ymm14[7],zero,zero,zero,ymm14[19],zero,ymm14[21],zero,zero,ymm14[20],zero,ymm14[22],zero,ymm14[24],zero,zero,ymm14[23],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[9],zero,ymm14[11,u],zero,ymm14[10],zero,ymm14[12],zero,ymm14[14,u],zero,ymm14[13],zero,ymm14[15,u,25],zero,ymm14[27,u],zero,ymm14[26],zero,ymm14[28],zero,ymm14[30,u],zero,ymm14[29],zero,ymm14[31,u] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm5[5],zero,zero,ymm5[4],zero,ymm5[6],zero,ymm5[8],zero,zero,ymm5[7],zero,ymm5[9],zero,zero,zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[3],zero,ymm11[5,u],zero,ymm11[4],zero,ymm11[6],zero,ymm11[8,u],zero,ymm11[7],zero,ymm11[9,u,19],zero,ymm11[21,u],zero,ymm11[20],zero,ymm11[22],zero,ymm11[24,u],zero,ymm11[23],zero,ymm11[25,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[10],zero,ymm5[12],zero,zero,ymm5[11],zero,ymm5[13],zero,ymm5[15],zero,zero,ymm5[14],zero,zero,zero,ymm5[26],zero,ymm5[28],zero,zero,ymm5[27],zero,ymm5[29],zero,ymm5[31],zero,zero,ymm5[30],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[11,u],zero,ymm11[10],zero,ymm11[12,u],zero,ymm11[u],zero,ymm11[13],zero,ymm11[15,u],zero,ymm11[14,27,u],zero,ymm11[26],zero,ymm11[28,u],zero,ymm11[u],zero,ymm11[29],zero,ymm11[31,u],zero,ymm11[30] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm25, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm31, %zmm5 -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm20[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm27, %zmm1, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm17[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm20[0,0,1,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm6, %zmm16, %zmm10 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm12, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [18374966859431608575,18374966859431608575,18446463693966278400,18446463693966278400] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm14, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm22[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm23[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm6, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpandq %ymm6, %ymm29, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm18[2,2,3,3,6,6,7,7] -; AVX512F-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm30 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> -; AVX512F-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm4[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm9, %zmm16, %zmm15 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpor %ymm8, %ymm10, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm25, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm16, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm21[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm23[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm24[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm19[2,2,3,3,6,6,7,7] +; AVX512F-FAST-NEXT: vporq %zmm8, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm8 +; AVX512F-FAST-NEXT: vpor %ymm6, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm9, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm26[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm28[0,0,1,1,4,4,5,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = -; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 64(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 256(%r9) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 192(%r9) +; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 256(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, 192(%r9) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64: -; AVX512BW-ONLY-SLOW: # %bb.0: -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512BW-ONLY-SLOW-NEXT: vzeroupper -; AVX512BW-ONLY-SLOW-NEXT: retq +; AVX512BW-SLOW-LABEL: store_i8_stride5_vf64: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm3, %xmm3 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm3 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm9, %xmm9 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512BW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm25 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm25[0,0,1,1] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm3 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm16 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm10, %ymm16, %ymm24 +; AVX512BW-SLOW-NEXT: vporq %ymm23, %ymm24, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm27 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm7, %xmm7 +; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm7, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm13, %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm29 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm12, %xmm12 +; AVX512BW-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm11[0,0,1,1] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm24 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <3,3,3,u,4,4,4,4> +; AVX512BW-SLOW-NEXT: vpermd %ymm24, %ymm11, %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm30 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm30, %ymm23 {%k4} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm13, %zmm13 +; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k5 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k5} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm23 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> +; AVX512BW-SLOW-NEXT: vpermd %ymm23, %ymm13, %ymm31 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm25[1,1,2,2] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm25, %zmm25 +; AVX512BW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm7 {%k6} +; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm17, %xmm25 +; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm15, %xmm26 +; AVX512BW-SLOW-NEXT: vporq %xmm25, %xmm26, %xmm25 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm15, %xmm15 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm15[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm20, %xmm15 +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm19, %xmm18 +; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm18, %xmm15 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm18, %xmm18 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm18, %zmm15 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm15 {%k6} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm22[0,0,1,1] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm22[1,1,2,2] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512BW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm15 {%k6} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [3,128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm30, %ymm18 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [128,5,128,128,4,128,6,128,8,128,128,7,128,9,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm19, %ymm24, %ymm20 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] +; AVX512BW-SLOW-NEXT: vporq %ymm18, %ymm20, %ymm18 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm30, %ymm8 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm20 = ymm24[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm20, %ymm8 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,3,128,5,128,5,4,128,6,128,8,128,6,7,128,9,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm14, %ymm20 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,3,128,5,128,128,4,128,6,128,8,128,128,7,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm21, %ymm16, %ymm22 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] +; AVX512BW-SLOW-NEXT: vporq %ymm20, %ymm22, %ymm20 +; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm16, %ymm5 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm14, %ymm5 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm20, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm5 {%k2} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[0,2,1,1,4,6,5,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm1, %ymm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm21, %ymm0, %ymm14 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpor %ymm8, %ymm14, %ymm8 +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm19, %ymm4, %ymm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm8, %ymm1 +; AVX512BW-SLOW-NEXT: vpermd %ymm4, %ymm11, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm2, %ymm4 {%k4} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5} +; AVX512BW-SLOW-NEXT: vpermd %ymm6, %ymm13, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,2,1,1,4,6,5,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 256(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, (%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%r9) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride5_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm19 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm19, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm18, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm16 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm11, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm13, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm20 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm12, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm14, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm3[0,0,1,1] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> -; AVX512BW-FAST-NEXT: vpermd %ymm21, %ymm3, %ymm22 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm23 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,0,1,1] +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> +; AVX512BW-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm15 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm23, %ymm22 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm15 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm20, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm20 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28> -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <1,1,2,2,2,2,2,2,11,11,11,11,u,12,12,12> +; AVX512BW-FAST-NEXT: vpermd %zmm24, %zmm25, %zmm24 ; AVX512BW-FAST-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm0 {%k3} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,zmm22[21],zero,zero,zmm22[20],zero,zmm22[22],zero,zmm22[24],zero,zero,zmm22[23],zero,zmm22[25],zero,zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zmm22[58],zero,zmm22[60],zero,zero,zero,zero,zmm22[61],zero,zmm22[63],zero,zero,zmm22[62] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm21 = zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm21[21],zero,zero,zmm21[20],zero,zmm21[22],zero,zmm21[24],zero,zero,zmm21[23],zero,zmm21[25],zero,zero,zmm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm21[58],zero,zmm21[60],zero,zero,zmm21[59],zero,zmm21[61],zero,zmm21[63],zero,zero,zmm21[62],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm19 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm19[21],zero,zmm19[21,20],zero,zmm19[22],zero,zmm19[24],zero,zmm19[22,23],zero,zmm19[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm19[59],zero,zero,zmm19[58],zero,zmm19[60],zero,zmm19[62],zero,zero,zmm19[61],zero,zmm19[63],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm19[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm13 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[19],zero,zmm13[21],zero,zero,zmm13[20],zero,zmm13[22],zero,zmm13[24],zero,zero,zmm13[23],zero,zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[59],zero,zero,zmm13[58],zero,zmm13[60],zero,zmm13[62],zero,zero,zmm13[61],zero,zmm13[63],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,2,3,3,6,6,7,7] -; AVX512BW-FAST-NEXT: vporq %zmm19, %zmm13, %zmm13 -; AVX512BW-FAST-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm13 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [4,6,5,5,5,5,4,6,30,30,30,30,31,31,31,31] -; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm20, %zmm19 -; AVX512BW-FAST-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512BW-FAST-NEXT: kmovq %rax, %k4 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm19, %zmm13 {%k4} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm19[27],zero,zero,ymm19[26],zero,ymm19[28],zero,ymm19[30],zero,zero,ymm19[29],zero,ymm19[31],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,3,3] -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm21 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm21[27],zero,zero,ymm21[26],zero,ymm21[28],zero,ymm21[30],zero,zero,ymm21[29],zero,ymm21[31],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm22, %ymm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm16[0],xmm18[0],xmm16[1],xmm18[1],xmm16[2],xmm18[2],xmm16[3],xmm18[3],xmm16[4],xmm18[4],xmm16[5],xmm18[5],xmm16[6],xmm18[6],xmm16[7],xmm18[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm16, %xmm16 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,0,1,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm16[26],zero,ymm16[28],zero,zero,zero,zero,ymm16[29],zero,ymm16[31],zero,zero,ymm16[30] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm23 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm24 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[26],zero,ymm23[28],zero,zero,ymm23[27],zero,ymm23[29],zero,ymm23[31],zero,zero,ymm23[30],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] -; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm24, %ymm22 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3],xmm15[4],xmm17[4],xmm15[5],xmm17[5],xmm15[6],xmm17[6],xmm15[7],xmm17[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm15, %xmm15 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm22, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm20, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm20, %zmm5 -; AVX512BW-FAST-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm15 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 -; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm8 -; AVX512BW-FAST-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm6, %xmm6 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,1,1,4,4,5,5] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm6 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm7 -; AVX512BW-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm7, %xmm7 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm7, %zmm6 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm0 {%k3} +; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm10, %xmm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm17, %xmm18 +; AVX512BW-FAST-NEXT: vporq %xmm16, %xmm18, %xmm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm17[0],xmm10[1],xmm17[1],xmm10[2],xmm17[2],xmm10[3],xmm17[3],xmm10[4],xmm17[4],xmm10[5],xmm17[5],xmm10[6],xmm17[6],xmm10[7],xmm17[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm10, %xmm10 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm16, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm10[0,0,1,1,4,4,5,5] +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm19, %xmm10 +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm21, %xmm18 +; AVX512BW-FAST-NEXT: vporq %xmm10, %xmm18, %xmm10 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3],xmm19[4],xmm21[4],xmm19[5],xmm21[5],xmm19[6],xmm21[6],xmm19[7],xmm21[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm18, %xmm18 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm18, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] ; AVX512BW-FAST-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm7, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm23, %zmm23, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,0,0,0,0,0,1,1,9,9,10,10,10,10,10,10] +; AVX512BW-FAST-NEXT: vpermd %zmm16, %zmm18, %zmm16 ; AVX512BW-FAST-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm21, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm19, %ymm2 -; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm21[21],zero,ymm21[21,20],zero,ymm21[22],zero,ymm21[24],zero,ymm21[22,23],zero,ymm21[25] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm19[19],zero,ymm19[21],zero,zero,ymm19[20],zero,ymm19[22],zero,ymm19[24],zero,zero,ymm19[23],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm16[21],zero,zero,ymm16[20],zero,ymm16[22],zero,ymm16[24],zero,zero,ymm16[23],zero,ymm16[25],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[21],zero,zero,ymm23[20],zero,ymm23[22],zero,ymm23[24],zero,zero,ymm23[23],zero,ymm23[25],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vpermd %ymm23, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm16, %ymm3 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> -; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm16[27],zero,zero,ymm16[26],zero,ymm16[28],zero,ymm16[30],zero,zero,ymm16[29],zero,ymm16[31],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,2,3,3] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm20 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm20[27],zero,zero,ymm20[26],zero,ymm20[28],zero,ymm20[30],zero,zero,ymm20[29],zero,ymm20[31],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,3,3] +; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm21, %ymm18 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm11, %xmm11 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %ymm17 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,zero,zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %ymm18 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,3,3] +; AVX512BW-FAST-NEXT: vporq %ymm11, %ymm21, %ymm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm12, %xmm12 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-FAST-NEXT: kmovq %rax, %k3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm13, %zmm11 {%k3} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512BW-FAST-NEXT: vpermd %zmm13, %zmm14, %zmm13 +; AVX512BW-FAST-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 +; AVX512BW-FAST-NEXT: kmovq %rax, %k4 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4} +; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm20, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm16, %ymm6 +; AVX512BW-FAST-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm20[2,3],zero,ymm20[5],zero,ymm20[5,4],zero,ymm20[6],zero,ymm20[8],zero,ymm20[6,7],zero,ymm20[9,18,19],zero,ymm20[21],zero,ymm20[21,20],zero,ymm20[22],zero,ymm20[24],zero,ymm20[22,23],zero,ymm20[25] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,ymm16[3],zero,ymm16[5],zero,zero,ymm16[4],zero,ymm16[6],zero,ymm16[8],zero,zero,ymm16[7],zero,zero,zero,ymm16[19],zero,ymm16[21],zero,zero,ymm16[20],zero,ymm16[22],zero,ymm16[24],zero,zero,ymm16[23],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm13, %ymm6 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[3],zero,ymm17[5],zero,zero,ymm17[4],zero,ymm17[6],zero,ymm17[8],zero,zero,ymm17[7],zero,ymm17[9],zero,ymm17[19],zero,ymm17[21],zero,zero,ymm17[20],zero,ymm17[22],zero,ymm17[24],zero,zero,ymm17[23],zero,ymm17[25],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,ymm18[5],zero,zero,ymm18[4],zero,ymm18[6],zero,ymm18[8],zero,zero,ymm18[7],zero,ymm18[9],zero,zero,zero,ymm18[21],zero,zero,ymm18[20],zero,ymm18[22],zero,ymm18[24],zero,zero,ymm18[23],zero,ymm18[25],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] +; AVX512BW-FAST-NEXT: vpor %ymm6, %ymm13, %ymm6 +; AVX512BW-FAST-NEXT: vpermd %ymm18, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm17, %ymm8 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vpermd %zmm12, %zmm5, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 128(%r9) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%r9) +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,zmm5[21],zero,zmm5[21,20],zero,zmm5[22],zero,zmm5[24],zero,zmm5[22,23],zero,zmm5[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm5[59],zero,zero,zmm5[58],zero,zmm5[60],zero,zmm5[62],zero,zero,zmm5[61],zero,zmm5[63],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm1[19],zero,zmm1[21],zero,zero,zmm1[20],zero,zmm1[22],zero,zmm1[24],zero,zero,zmm1[23],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zmm1[58],zero,zmm1[60],zero,zmm1[62],zero,zero,zmm1[61],zero,zmm1[63],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,zmm4[21],zero,zero,zmm4[20],zero,zmm4[22],zero,zmm4[24],zero,zero,zmm4[23],zero,zmm4[25],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zmm4[58],zero,zmm4[60],zero,zero,zero,zero,zmm4[61],zero,zmm4[63],zero,zero,zmm4[62] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[21],zero,zero,zmm3[20],zero,zmm3[22],zero,zmm3[24],zero,zero,zmm3[23],zero,zmm3[25],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm3[58],zero,zmm3[60],zero,zero,zmm3[59],zero,zmm3[61],zero,zmm3[63],zero,zero,zmm3[62],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] +; AVX512BW-FAST-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,5,5,5,5,4,6,14,14,14,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermd %zmm7, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 128(%r9) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 192(%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq -; -; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64: -; AVX512DQBW-SLOW: # %bb.0: -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 -; AVX512DQBW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 -; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] -; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 -; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] -; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 -; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] -; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 -; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 -; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512DQBW-SLOW-NEXT: vzeroupper -; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -4818,9 +4619,11 @@ ; AVX2: {{.*}} ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} +; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} +; AVX512DQBW-SLOW: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -22,31 +22,29 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,5,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, (%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movq %xmm3, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE-NEXT: movd %xmm0, 8(%rax) ; SSE-NEXT: retq ; @@ -55,14 +53,13 @@ ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r9), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vmovdqa (%r8), %xmm2 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1],zero,xmm0[2,6,10,14,3],zero,xmm0[u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,xmm2[1,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,2,6,10,14,3,7,u,u,u,u] ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -90,44 +87,51 @@ ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%r8), %xmm0 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,5] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,6] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,5] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] +; SSE-NEXT: packuswb %xmm3, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,0,0] -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movq %xmm2, 16(%rax) ; SSE-NEXT: movdqa %xmm3, (%rax) ; SSE-NEXT: retq @@ -135,13 +139,13 @@ ; AVX1-ONLY-LABEL: store_i8_stride6_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0,2] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,xmm0[1,5,9,13],zero,zero,xmm0[2,6,10,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4],zero,zero,zero,zero,xmm1[1,5],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 @@ -155,13 +159,15 @@ ; AVX2-ONLY-LABEL: store_i8_stride6_vf4: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -176,13 +182,15 @@ ; AVX512F-LABEL: store_i8_stride6_vf4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,ymm0[1,5,9,13],zero,zero,ymm0[2,6,10,14,18,22],zero,zero,zero,zero,ymm0[19,23,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -197,13 +205,15 @@ ; AVX512BW-LABEL: store_i8_stride6_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12,u,u,1,5,9,13,u,u,2,6,10,14,18,22,u,u,u,u,19,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -299,30 +309,32 @@ ; AVX1-ONLY-LABEL: store_i8_stride6_vf8: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u],zero,zero,xmm1[3,11,u,u],zero,zero,xmm1[4,12,u,u],zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u],zero,zero,xmm2[3,11,u,u],zero,zero,xmm2[4,12,u,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,11],zero,zero,xmm0[u,u,4,12],zero,zero,xmm0[u,u,5,13] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,10,u,u,u,u,3,11,u,u,u,u,4,12,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[0,8,u,u],zero,zero,xmm1[1,9,u,u],zero,zero,xmm1[2,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[0,8,u,u],zero,zero,xmm2[1,9,u,u],zero,zero,xmm2[2,10] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,8],zero,zero,xmm0[u,u,1,9],zero,zero,xmm0[u,u,2,10],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,13,u,u],zero,zero,xmm2[6,14,u,u],zero,zero,xmm2[7,15,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax) @@ -334,29 +346,31 @@ ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[u,u,6,14],zero,zero,xmm1[u,u,7,15],zero,zero,xmm1[u,u] +; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,13,u,u],zero,zero,xmm4[6,14,u,u],zero,zero,xmm4[7,15,u,u] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vmovdqa %xmm3, 32(%rax) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -365,29 +379,31 @@ ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512F-NEXT: vmovdqa %ymm1, (%rax) +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[u,u,6,14],zero,zero,xmm2[u,u,7,15],zero,zero,xmm2[u,u] +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,13,u,u],zero,zero,xmm3[6,14,u,u],zero,zero,xmm3[7,15,u,u] +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512F-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -396,31 +412,33 @@ ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm2[u,u,6,14],zero,zero,xmm2[u,u,7,15],zero,zero,xmm2[u,u] +; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,13,u,u],zero,zero,xmm3[6,14,u,u],zero,zero,xmm3[7,15,u,u] +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm2 {%k1} +; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -443,138 +461,134 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm11 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm13 -; SSE-NEXT: movdqa (%rcx), %xmm2 -; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm12[8],xmm2[9],xmm12[9],xmm2[10],xmm12[10],xmm2[11],xmm12[11],xmm2[12],xmm12[12],xmm2[13],xmm12[13],xmm2[14],xmm12[14],xmm2[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm2[1,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm15, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm14 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pandn %xmm14, %xmm15 -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] -; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm3 +; SSE-NEXT: movdqa (%rcx), %xmm7 +; SSE-NEXT: movdqa (%r8), %xmm13 +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm5, %xmm14 +; SSE-NEXT: por %xmm6, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm5, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm15, %xmm6 +; SSE-NEXT: por %xmm14, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm7, %xmm15 +; SSE-NEXT: por %xmm9, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pand %xmm9, %xmm15 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] +; SSE-NEXT: movdqa %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm7, %xmm14 ; SSE-NEXT: por %xmm15, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm15, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: por %xmm14, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] -; SSE-NEXT: movdqa %xmm14, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,1,1] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[2,2,3,3] -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm14 +; SSE-NEXT: por %xmm14, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: pand %xmm9, %xmm14 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: por %xmm14, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,2,2] +; SSE-NEXT: pandn %xmm1, %xmm7 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm12[2,2,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm9, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,3,3] +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm10, 16(%rax) -; SSE-NEXT: movdqa %xmm14, 32(%rax) -; SSE-NEXT: movdqa %xmm3, 48(%rax) -; SSE-NEXT: movdqa %xmm15, 80(%rax) -; SSE-NEXT: movdqa %xmm12, 64(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movdqa %xmm10, 80(%rax) +; SSE-NEXT: movdqa %xmm5, 48(%rax) +; SSE-NEXT: movdqa %xmm11, 32(%rax) +; SSE-NEXT: movdqa %xmm7, 16(%rax) +; SSE-NEXT: movdqa %xmm8, 64(%rax) +; SSE-NEXT: movdqa %xmm6, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf16: @@ -685,9 +699,9 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512F-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] @@ -724,9 +738,9 @@ ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -781,636 +795,616 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $200, %rsp -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rcx), %xmm4 -; SSE-NEXT: movdqa 16(%r8), %xmm11 -; SSE-NEXT: movdqa 16(%r9), %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0] +; SSE-NEXT: subq $40, %rsp +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa (%rsi), %xmm9 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa (%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r8), %xmm10 +; SSE-NEXT: movdqa (%r9), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: pandn %xmm3, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,5,6,7,7] +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm5, %xmm15 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,0,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm11, %xmm15 +; SSE-NEXT: por %xmm15, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsi), %xmm12 -; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm8, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa (%rdx), %xmm14 -; SSE-NEXT: movdqa (%rcx), %xmm6 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,7,7] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm8, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa 16(%rcx), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm11 -; SSE-NEXT: pandn %xmm3, %xmm11 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,2,2] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3],xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[0,0,1,1] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa 16(%r8), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm9, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa 16(%r9), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,2,2] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[3,3,3,3] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm10, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,2,3,3] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm14 -; SSE-NEXT: pandn %xmm0, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm5, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por %xmm3, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm3, %xmm8 -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: por %xmm8, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm10, %xmm8 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,0,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1] ; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,2,2] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] -; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; SSE-NEXT: pandn %xmm3, %xmm13 +; SSE-NEXT: por %xmm10, %xmm13 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: por %xmm10, %xmm11 +; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,2,3,3] +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pshuflw $161, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,0,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,1,1] -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd $0, (%rsp), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: pand %xmm10, %xmm13 -; SSE-NEXT: pshufd $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,0,0] -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm13, %xmm10 -; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[3,3,3,3] -; SSE-NEXT: pshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[0,1,2,3,5,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] -; SSE-NEXT: pand %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm2 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm7, 32(%rax) -; SSE-NEXT: movdqa %xmm10, 48(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm15, 112(%rax) -; SSE-NEXT: movdqa %xmm6, 160(%rax) -; SSE-NEXT: movdqa %xmm11, 176(%rax) -; SSE-NEXT: movdqa %xmm12, (%rax) -; SSE-NEXT: movdqa %xmm14, 16(%rax) +; SSE-NEXT: movdqa %xmm7, 176(%rax) +; SSE-NEXT: movdqa %xmm5, 160(%rax) +; SSE-NEXT: movdqa %xmm13, 144(%rax) +; SSE-NEXT: movdqa %xmm8, 128(%rax) +; SSE-NEXT: movdqa %xmm9, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 64(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) -; SSE-NEXT: addq $200, %rsp +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $40, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm14 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[8],zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm14[5],zero,zero,zero,zero,zero,xmm14[6],zero,zero,zero,zero,zero,xmm14[7] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps %ymm3, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,u],zero,zero,zero,zero,xmm5[3,u],zero,zero,zero,zero,xmm5[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm2[1,2],xmm7[3],xmm2[4,5],xmm7[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[2],zero,zero,zero,zero,zero,xmm3[3],zero,zero,zero,zero,zero,xmm3[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm5[0,u],zero,zero,zero,zero,xmm5[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm3[0],zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm14[10],zero,zero,zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,xmm14[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[13,u],zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,xmm5[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[13],zero,zero,zero,zero,zero,xmm3[14],zero,zero,zero,zero,zero,xmm3[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm1 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm14, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[10,u],zero,zero,zero,zero,xmm5[11,u],zero,zero,zero,zero,xmm5[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm3[10],zero,zero,zero,zero,zero,xmm3[11],zero,zero,zero,zero,zero,xmm3[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm9 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3],xmm0[4],xmm9[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm8 -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm15[1,2],xmm8[3],xmm15[4,5],xmm8[6],xmm15[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0],zero,xmm8[2,3,4,5,6],zero,xmm8[8,9,10,11,12],zero,xmm8[14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1] -; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm15, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm15, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm8, %ymm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm14[0],zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm14, %xmm14 -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm12[2],xmm0[3,4],xmm12[5],xmm0[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = mem[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm15, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm15, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm15, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm9, %ymm7 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm13 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[8,u],zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm3[8],zero,zero,zero,zero,zero,xmm3[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0,u],zero,zero,zero,zero,xmm2[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[2],zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,xmm1[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm15, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm2, %ymm7 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm2 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm10 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm9, 112(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm11, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 176(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 160(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 112(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 176(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: subq $40, %rsp +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm10 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm15 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[16],ymm15[16],ymm8[17],ymm15[17],ymm8[18],ymm15[18],ymm8[19],ymm15[19],ymm8[20],ymm15[20],ymm8[21],ymm15[21],ymm8[22],ymm15[22],ymm8[23],ymm15[23] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm12 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm12, %ymm15, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm15, %ymm13 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm13 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm15, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm14, %ymm14 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm7[8],ymm2[9],ymm7[9],ymm2[10],ymm7[10],ymm2[11],ymm7[11],ymm2[12],ymm7[12],ymm2[13],ymm7[13],ymm2[14],ymm7[14],ymm2[15],ymm7[15],ymm2[24],ymm7[24],ymm2[25],ymm7[25],ymm2[26],ymm7[26],ymm2[27],ymm7[27],ymm2[28],ymm7[28],ymm2[29],ymm7[29],ymm2[30],ymm7[30],ymm2[31],ymm7[31] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15],ymm6[24],ymm4[24],ymm6[25],ymm4[25],ymm6[26],ymm4[26],ymm6[27],ymm4[27],ymm6[28],ymm4[28],ymm6[29],ymm4[29],ymm6[30],ymm4[30],ymm6[31],ymm4[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm2, 128(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm14, 96(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm14, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%rax) +; AVX2-SLOW-NEXT: addq $40, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -1420,27 +1414,25 @@ ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] @@ -1458,90 +1450,87 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -1551,27 +1540,25 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm9, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] @@ -1589,194 +1576,194 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm15, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm15, %ymm11, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm14, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm14, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,u,17,u,16,u,19,u,u,u,u,u,20,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride6_vf32: ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX512F-SLOW-NEXT: vprold $16, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[16],ymm13[16],ymm11[17],ymm13[17],ymm11[18],ymm13[18],ymm11[19],ymm13[19],ymm11[20],ymm13[20],ymm11[21],ymm13[21],ymm11[22],ymm13[22],ymm11[23],ymm13[23] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm5, %zmm17, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm14 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX512F-SLOW-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm12 +; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm18 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm18, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpandq %zmm7, %zmm12, %zmm15 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15],ymm11[24],ymm13[24],ymm11[25],ymm13[25],ymm11[26],ymm13[26],ymm11[27],ymm13[27],ymm11[28],ymm13[28],ymm11[29],ymm13[29],ymm11[30],ymm13[30],ymm11[31],ymm13[31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 -; AVX512F-SLOW-NEXT: vpandq %zmm16, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15],ymm10[24],ymm12[24],ymm10[25],ymm12[25],ymm10[26],ymm12[26],ymm10[27],ymm12[27],ymm10[28],ymm12[28],ymm10[29],ymm12[29],ymm10[30],ymm12[30],ymm10[31],ymm12[31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $186, %zmm14, %zmm15, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm10 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15],ymm8[24],ymm9[24],ymm8[25],ymm9[25],ymm8[26],ymm9[26],ymm8[27],ymm9[27],ymm8[28],ymm9[28],ymm8[29],ymm9[29],ymm8[30],ymm9[30],ymm8[31],ymm9[31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[4],ymm14[4],ymm1[5],ymm14[5],ymm1[6],ymm14[6],ymm1[7],ymm14[7],ymm1[16],ymm14[16],ymm1[17],ymm14[17],ymm1[18],ymm14[18],ymm1[19],ymm14[19],ymm1[20],ymm14[20],ymm1[21],ymm14[21],ymm1[22],ymm14[22],ymm1[23],ymm14[23] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vprold $16, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,0,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15],ymm3[24],ymm5[24],ymm3[25],ymm5[25],ymm3[26],ymm5[26],ymm3[27],ymm5[27],ymm3[28],ymm5[28],ymm3[29],ymm5[29],ymm3[30],ymm5[30],ymm3[31],ymm5[31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm6, %zmm9 +; AVX512F-SLOW-NEXT: vpandq %zmm9, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $186, %zmm8, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm14[8],ymm1[9],ymm14[9],ymm1[10],ymm14[10],ymm1[11],ymm14[11],ymm1[12],ymm14[12],ymm1[13],ymm14[13],ymm1[14],ymm14[14],ymm1[15],ymm14[15],ymm1[24],ymm14[24],ymm1[25],ymm14[25],ymm1[26],ymm14[26],ymm1[27],ymm14[27],ymm1[28],ymm14[28],ymm1[29],ymm14[29],ymm1[30],ymm14[30],ymm1[31],ymm14[31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -1788,8 +1775,8 @@ ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm6 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -1798,186 +1785,190 @@ ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] ; AVX512F-FAST-NEXT: vprold $16, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[16],ymm13[16],ymm11[17],ymm13[17],ymm11[18],ymm13[18],ymm11[19],ymm13[19],ymm11[20],ymm13[20],ymm11[21],ymm13[21],ymm11[22],ymm13[22],ymm11[23],ymm13[23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm14 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[16],ymm13[16],ymm11[17],ymm13[17],ymm11[18],ymm13[18],ymm11[19],ymm13[19],ymm11[20],ymm13[20],ymm11[21],ymm13[21],ymm11[22],ymm13[22],ymm11[23],ymm13[23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm8 +; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm16 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm7, %zmm16, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm7, %zmm8, %zmm15 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm9 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[16],ymm15[16],ymm2[17],ymm15[17],ymm2[18],ymm15[18],ymm2[19],ymm15[19],ymm2[20],ymm15[20],ymm2[21],ymm15[21],ymm2[22],ymm15[22],ymm2[23],ymm15[23] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11],ymm11[12],ymm13[12],ymm11[13],ymm13[13],ymm11[14],ymm13[14],ymm11[15],ymm13[15],ymm11[24],ymm13[24],ymm11[25],ymm13[25],ymm11[26],ymm13[26],ymm11[27],ymm13[27],ymm11[28],ymm13[28],ymm11[29],ymm13[29],ymm11[30],ymm13[30],ymm11[31],ymm13[31] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 -; AVX512F-FAST-NEXT: vpandq %zmm16, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[16],ymm13[16],ymm2[17],ymm13[17],ymm2[18],ymm13[18],ymm2[19],ymm13[19],ymm2[20],ymm13[20],ymm2[21],ymm13[21],ymm2[22],ymm13[22],ymm2[23],ymm13[23] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm15 +; AVX512F-FAST-NEXT: vpandq %zmm15, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[4],ymm13[4],ymm11[5],ymm13[5],ymm11[6],ymm13[6],ymm11[7],ymm13[7],ymm11[16],ymm13[16],ymm11[17],ymm13[17],ymm11[18],ymm13[18],ymm11[19],ymm13[19],ymm11[20],ymm13[20],ymm11[21],ymm13[21],ymm11[22],ymm13[22],ymm11[23],ymm13[23] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm10[8],ymm12[8],ymm10[9],ymm12[9],ymm10[10],ymm12[10],ymm10[11],ymm12[11],ymm10[12],ymm12[12],ymm10[13],ymm12[13],ymm10[14],ymm12[14],ymm10[15],ymm12[15],ymm10[24],ymm12[24],ymm10[25],ymm12[25],ymm10[26],ymm12[26],ymm10[27],ymm12[27],ymm10[28],ymm12[28],ymm10[29],ymm12[29],ymm10[30],ymm12[30],ymm10[31],ymm12[31] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $186, %zmm2, %zmm14, %zmm10 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[4],ymm11[4],ymm2[5],ymm11[5],ymm2[6],ymm11[6],ymm2[7],ymm11[7],ymm2[16],ymm11[16],ymm2[17],ymm11[17],ymm2[18],ymm11[18],ymm2[19],ymm11[19],ymm2[20],ymm11[20],ymm2[21],ymm11[21],ymm2[22],ymm11[22],ymm2[23],ymm11[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $186, %zmm14, %zmm15, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm8[8],ymm9[8],ymm8[9],ymm9[9],ymm8[10],ymm9[10],ymm8[11],ymm9[11],ymm8[12],ymm9[12],ymm8[13],ymm9[13],ymm8[14],ymm9[14],ymm8[15],ymm9[15],ymm8[24],ymm9[24],ymm8[25],ymm9[25],ymm8[26],ymm9[26],ymm8[27],ymm9[27],ymm8[28],ymm9[28],ymm8[29],ymm9[29],ymm8[30],ymm9[30],ymm8[31],ymm9[31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm2 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm17, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i8_stride6_vf32: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-SLOW-NEXT: movw $18724, %cx # imm = 0x4924 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm6, %ymm10 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm6 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-SLOW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-SLOW-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm10 {%k2} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm10 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] -; AVX512BW-SLOW-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-SLOW-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-SLOW-NEXT: vpermw %ymm4, %ymm10, %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm4 ; AVX512BW-SLOW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k3} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] -; AVX512BW-SLOW-NEXT: vpermw %ymm11, %ymm13, %ymm11 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-SLOW-NEXT: vprold $16, %ymm13, %ymm13 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm13, %ymm11 {%k2} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm9, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm9, %ymm7 {%k1} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm11[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [10,9,8,11,10,9,8,11,10,9,8,11,12,12,12,12] +; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm12, %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm13 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 ; AVX512BW-SLOW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm9, %zmm7 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k2} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm14 +; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3],xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-SLOW-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512BW-SLOW-NEXT: movl $1227105426, %ecx # imm = 0x49242492 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm7 {%k2} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] ; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm9 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm9 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[4],ymm9[4],ymm8[5],ymm9[5],ymm8[6],ymm9[6],ymm8[7],ymm9[7],ymm8[16],ymm9[16],ymm8[17],ymm9[17],ymm8[18],ymm9[18],ymm8[19],ymm9[19],ymm8[20],ymm9[20],ymm8[21],ymm9[21],ymm8[22],ymm9[22],ymm8[23],ymm9[23] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm10, %ymm9 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm6 ; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-SLOW-NEXT: vpermw %ymm2, %ymm3, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512BW-SLOW-NEXT: movl $1227114788, %ecx # imm = 0x49244924 +; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm5 ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[16],ymm5[16],ymm3[17],ymm5[17],ymm3[18],ymm5[18],ymm3[19],ymm5[19],ymm3[20],ymm5[20],ymm3[21],ymm5[21],ymm3[22],ymm5[22],ymm3[23],ymm5[23] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] @@ -1987,116 +1978,112 @@ ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf32: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm11, %ymm12, %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: movw $9362, %cx # imm = 0x2492 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm12, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm12 -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512BW-FAST-NEXT: movw $18724, %cx # imm = 0x4924 +; AVX512BW-FAST-NEXT: vpermw %ymm11, %ymm12, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm13, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8 +; AVX512BW-FAST-NEXT: movl $1227105426, %ecx # imm = 0x49242492 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm6, %ymm12 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm8 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512BW-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512BW-FAST-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k3 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm6 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] -; AVX512BW-FAST-NEXT: vpermw %ymm13, %ymm15, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [9,8,11,10,9,8,11,10,9,8,11,10,13,12,15,14] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm15, %ymm13 {%k2} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm12 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7] -; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm9, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-FAST-NEXT: vpermw %ymm7, %ymm9, %ymm8 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] -; AVX512BW-FAST-NEXT: movl $1227133513, %ecx # imm = 0x49249249 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm14, %ymm15, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm5, %ymm14 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm16, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: movl $1227114788, %ecx # imm = 0x49244924 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 -; AVX512BW-FAST-NEXT: vpermw %zmm8, %zmm9, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm14 {%k2} +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm15 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[16],ymm15[16],ymm13[17],ymm15[17],ymm13[18],ymm15[18],ymm13[19],ymm15[19],ymm13[20],ymm15[20],ymm13[21],ymm15[21],ymm13[22],ymm15[22],ymm13[23],ymm15[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm16, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 ; AVX512BW-FAST-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm14 {%k2} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm3, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,26,25,24,27,26,25,24,27,26,25,24,27,28,28,28,28] +; AVX512BW-FAST-NEXT: movl $1227133513, %ecx # imm = 0x49249249 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -2690,376 +2677,370 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride6_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $200, %rsp -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: subq $136, %rsp +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm1[1,2],xmm10[3],xmm1[4,5],xmm10[6],xmm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm1[2],zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,xmm1[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0,u],zero,zero,zero,zero,xmm0[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm0[8,u],zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm1[8],zero,zero,zero,zero,zero,xmm1[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[13,u],zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,xmm0[15,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6],zero,xmm0[8,9,10,11,12],zero,xmm0[14,15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,u],zero,zero,zero,zero,xmm0[11,u],zero,zero,zero,zero,xmm0[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm13, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm11[13,u],zero,zero,zero,zero,xmm11[14,u],zero,zero,zero,zero,xmm11[15,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3],xmm13[4],xmm0[5,6],xmm13[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1,2],xmm13[3],xmm8[4,5],xmm13[6],xmm8[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5],xmm9[6],xmm8[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[2,u],zero,zero,zero,zero,xmm11[3,u],zero,zero,zero,zero,xmm11[4,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm14 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm6 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1,2],xmm14[3],xmm1[4,5],xmm14[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm14, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 -; AVX1-ONLY-NEXT: vmovaps %ymm12, %ymm15 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm2 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,3,3,3] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm14 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7] +; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm14 +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3,4],xmm5[5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm1 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm5 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[2,u],zero,zero,zero,zero,xmm0[3,u],zero,zero,zero,zero,xmm0[4,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm0[0,u],zero,zero,zero,zero,xmm0[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm15[2],xmm12[3,4],xmm15[5],xmm12[6,7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm4 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm0[8,u],zero,zero,zero,zero,xmm0[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[5,u],zero,zero,zero,zero,xmm0[6,u],zero,zero,zero,zero,xmm0[7,u] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5],zero,zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[10,u],zero,zero,zero,zero,xmm3[11,u],zero,zero,zero,zero,xmm3[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm0[13,u],zero,zero,zero,zero,xmm0[14,u],zero,zero,zero,zero,xmm0[15,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[13],zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,xmm1[15] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,u],zero,zero,zero,zero,xmm0[11,u],zero,zero,zero,zero,xmm0[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[10],zero,zero,zero,zero,zero,xmm1[11],zero,zero,zero,zero,zero,xmm1[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,0,1,1] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,2,2] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm8 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm9 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5],xmm0[6],xmm9[7] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm0[2],zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm14 = xmm9[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm14 = xmm5[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm15, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u] -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm15 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7] -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm0, %xmm15 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2],xmm13[3],xmm14[4,5],xmm13[6],xmm14[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm12, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm13 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1,2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm10, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128] -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[1,0,2,2,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,3] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[3,3,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm6 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1,2],xmm3[3],xmm7[4,5],xmm3[6],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0,1,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[1,0,2,2,4,5,6,7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] -; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u] -; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2],zero,xmm2[4,5,6,7,8],zero,xmm2[10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm11, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,2,2] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[3,3,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm11, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[13,u],zero,zero,zero,zero,xmm1[14,u],zero,zero,zero,zero,xmm1[15,u] +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm3, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm4, 368(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 320(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm7, 336(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm8, 288(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, 304(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm10, 256(%rax) +; AVX1-ONLY-NEXT: vmovdqa %xmm12, 272(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3069,44 +3050,35 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 144(%rax) ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 240(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 304(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 112(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 272(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 80(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 368(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 48(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 320(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovaps %xmm0, 336(%rax) -; AVX1-ONLY-NEXT: addq $200, %rsp +; AVX1-ONLY-NEXT: vmovaps %xmm0, 16(%rax) +; AVX1-ONLY-NEXT: addq $136, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride6_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $696, %rsp # imm = 0x2B8 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX2-SLOW-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 @@ -3116,27 +3088,28 @@ ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 @@ -3144,9 +3117,9 @@ ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 @@ -3155,16 +3128,16 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm5 @@ -3172,58 +3145,56 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm6 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm15, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -3237,12 +3208,11 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] @@ -3260,59 +3230,60 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[4],mem[4],ymm12[5],mem[5],ymm12[6],mem[6],ymm12[7],mem[7],ymm12[16],mem[16],ymm12[17],mem[17],ymm12[18],mem[18],ymm12[19],mem[19],ymm12[20],mem[20],ymm12[21],mem[21],ymm12[22],mem[22],ymm12[23],mem[23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm8[0],ymm11[0],ymm8[1],ymm11[1],ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[4],ymm11[4],ymm8[5],ymm11[5],ymm8[6],ymm11[6],ymm8[7],ymm11[7],ymm8[16],ymm11[16],ymm8[17],ymm11[17],ymm8[18],ymm11[18],ymm8[19],ymm11[19],ymm8[20],ymm11[20],ymm8[21],ymm11[21],ymm8[22],ymm11[22],ymm8[23],ymm11[23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm12[0],ymm13[0],ymm12[1],ymm13[1],ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[4],ymm13[4],ymm12[5],ymm13[5],ymm12[6],ymm13[6],ymm12[7],ymm13[7],ymm12[16],ymm13[16],ymm12[17],ymm13[17],ymm12[18],ymm13[18],ymm12[19],ymm13[19],ymm12[20],ymm13[20],ymm12[21],ymm13[21],ymm12[22],ymm13[22],ymm12[23],ymm13[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm15, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm14, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm11, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3347,1509 +3318,1463 @@ ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm14 = ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11],ymm12[12],mem[12],ymm12[13],mem[13],ymm12[14],mem[14],ymm12[15],mem[15],ymm12[24],mem[24],ymm12[25],mem[25],ymm12[26],mem[26],ymm12[27],mem[27],ymm12[28],mem[28],ymm12[29],mem[29],ymm12[30],mem[30],ymm12[31],mem[31] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11],ymm14[12],mem[12],ymm14[13],mem[13],ymm14[14],mem[14],ymm14[15],mem[15],ymm14[24],mem[24],ymm14[25],mem[25],ymm14[26],mem[26],ymm14[27],mem[27],ymm14[28],mem[28],ymm14[29],mem[29],ymm14[30],mem[30],ymm14[31],mem[31] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm8[8],ymm11[8],ymm8[9],ymm11[9],ymm8[10],ymm11[10],ymm8[11],ymm11[11],ymm8[12],ymm11[12],ymm8[13],ymm11[13],ymm8[14],ymm11[14],ymm8[15],ymm11[15],ymm8[24],ymm11[24],ymm8[25],ymm11[25],ymm8[26],ymm11[26],ymm8[27],ymm11[27],ymm8[28],ymm11[28],ymm8[29],ymm11[29],ymm8[30],ymm11[30],ymm8[31],ymm11[31] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm14 = ymm12[8],ymm13[8],ymm12[9],ymm13[9],ymm12[10],ymm13[10],ymm12[11],ymm13[11],ymm12[12],ymm13[12],ymm12[13],ymm13[13],ymm12[14],ymm13[14],ymm12[15],ymm13[15],ymm12[24],ymm13[24],ymm12[25],ymm13[25],ymm12[26],ymm13[26],ymm12[27],ymm13[27],ymm12[28],ymm13[28],ymm12[29],ymm13[29],ymm12[30],ymm13[30],ymm12[31],ymm13[31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm15, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm8, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm10 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm10, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 352(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $696, %rsp # imm = 0x2B8 +; AVX2-SLOW-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride6_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm10 +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm6 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[4],mem[4],ymm14[5],mem[5],ymm14[6],mem[6],ymm14[7],mem[7],ymm14[16],mem[16],ymm14[17],mem[17],ymm14[18],mem[18],ymm14[19],mem[19],ymm14[20],mem[20],ymm14[21],mem[21],ymm14[22],mem[22],ymm14[23],mem[23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[16],ymm9[16],ymm4[17],ymm9[17],ymm4[18],ymm9[18],ymm4[19],ymm9[19],ymm4[20],ymm9[20],ymm4[21],ymm9[21],ymm4[22],ymm9[22],ymm4[23],ymm9[23] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm10, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm14 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm15 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15],ymm13[24],mem[24],ymm13[25],mem[25],ymm13[26],mem[26],ymm13[27],mem[27],ymm13[28],mem[28],ymm13[29],mem[29],ymm13[30],mem[30],ymm13[31],mem[31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15],ymm4[24],ymm9[24],ymm4[25],ymm9[25],ymm4[26],ymm9[26],ymm4[27],ymm9[27],ymm4[28],ymm9[28],ymm4[29],ymm9[29],ymm4[30],ymm9[30],ymm4[31],ymm9[31] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride6_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX2-FAST-PERLANE-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[4],ymm6[4],ymm3[5],ymm6[5],ymm3[6],ymm6[6],ymm3[7],ymm6[7],ymm3[16],ymm6[16],ymm3[17],ymm6[17],ymm3[18],ymm6[18],ymm3[19],ymm6[19],ymm3[20],ymm6[20],ymm3[21],ymm6[21],ymm3[22],ymm6[22],ymm3[23],ymm6[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm12, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm13, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[4],mem[4],ymm14[5],mem[5],ymm14[6],mem[6],ymm14[7],mem[7],ymm14[16],mem[16],ymm14[17],mem[17],ymm14[18],mem[18],ymm14[19],mem[19],ymm14[20],mem[20],ymm14[21],mem[21],ymm14[22],mem[22],ymm14[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm12[8],ymm0[9],ymm12[9],ymm0[10],ymm12[10],ymm0[11],ymm12[11],ymm0[12],ymm12[12],ymm0[13],ymm12[13],ymm0[14],ymm12[14],ymm0[15],ymm12[15],ymm0[24],ymm12[24],ymm0[25],ymm12[25],ymm0[26],ymm12[26],ymm0[27],ymm12[27],ymm0[28],ymm12[28],ymm0[29],ymm12[29],ymm0[30],ymm12[30],ymm0[31],ymm12[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm7[8],ymm14[8],ymm7[9],ymm14[9],ymm7[10],ymm14[10],ymm7[11],ymm14[11],ymm7[12],ymm14[12],ymm7[13],ymm14[13],ymm7[14],ymm14[14],ymm7[15],ymm14[15],ymm7[24],ymm14[24],ymm7[25],ymm14[25],ymm7[26],ymm14[26],ymm7[27],ymm14[27],ymm7[28],ymm14[28],ymm7[29],ymm14[29],ymm7[30],ymm14[30],ymm7[31],ymm14[31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm8, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[16],ymm9[16],ymm4[17],ymm9[17],ymm4[18],ymm9[18],ymm4[19],ymm9[19],ymm4[20],ymm9[20],ymm4[21],ymm9[21],ymm4[22],ymm9[22],ymm4[23],ymm9[23] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm15, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm10, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw (%rsp), %xmm2, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm14 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm15 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm13[8],mem[8],ymm13[9],mem[9],ymm13[10],mem[10],ymm13[11],mem[11],ymm13[12],mem[12],ymm13[13],mem[13],ymm13[14],mem[14],ymm13[15],mem[15],ymm13[24],mem[24],ymm13[25],mem[25],ymm13[26],mem[26],ymm13[27],mem[27],ymm13[28],mem[28],ymm13[29],mem[29],ymm13[30],mem[30],ymm13[31],mem[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[16],ymm13[16],ymm15[17],ymm13[17],ymm15[18],ymm13[18],ymm15[19],ymm13[19],ymm15[20],ymm13[20],ymm15[21],ymm13[21],ymm15[22],ymm13[22],ymm15[23],ymm13[23] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15],ymm4[24],ymm9[24],ymm4[25],ymm9[25],ymm4[26],ymm9[26],ymm4[27],ymm9[27],ymm4[28],ymm9[28],ymm4[29],ymm9[29],ymm4[30],ymm9[30],ymm4[31],ymm9[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm3, %ymm12, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 256(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512F-SLOW-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm31 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm23 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm21 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm1, %ymm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15],ymm1[24],ymm15[24],ymm1[25],ymm15[25],ymm1[26],ymm15[26],ymm1[27],ymm15[27],ymm1[28],ymm15[28],ymm1[29],ymm15[29],ymm1[30],ymm15[30],ymm1[31],ymm15[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm11, %ymm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15],ymm11[24],ymm0[24],ymm11[25],ymm0[25],ymm11[26],ymm0[26],ymm11[27],ymm0[27],ymm11[28],ymm0[28],ymm11[29],ymm0[29],ymm11[30],ymm0[30],ymm11[31],ymm0[31] -; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm22 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[16],ymm0[16],ymm11[17],ymm0[17],ymm11[18],ymm0[18],ymm11[19],ymm0[19],ymm11[20],ymm0[20],ymm11[21],ymm0[21],ymm11[22],ymm0[22],ymm11[23],ymm0[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[16],ymm15[16],ymm1[17],ymm15[17],ymm1[18],ymm15[18],ymm1[19],ymm15[19],ymm1[20],ymm15[20],ymm1[21],ymm15[21],ymm1[22],ymm15[22],ymm1[23],ymm15[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm6, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm9[8],ymm1[8],ymm9[9],ymm1[9],ymm9[10],ymm1[10],ymm9[11],ymm1[11],ymm9[12],ymm1[12],ymm9[13],ymm1[13],ymm9[14],ymm1[14],ymm9[15],ymm1[15],ymm9[24],ymm1[24],ymm9[25],ymm1[25],ymm9[26],ymm1[26],ymm9[27],ymm1[27],ymm9[28],ymm1[28],ymm9[29],ymm1[29],ymm9[30],ymm1[30],ymm9[31],ymm1[31] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm14 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm14, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm14, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm20 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm15 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm12 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512F-SLOW-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm14, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm27 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] ; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm10 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm26 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm10, %ymm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15],ymm10[24],ymm1[24],ymm10[25],ymm1[25],ymm10[26],ymm1[26],ymm10[27],ymm1[27],ymm10[28],ymm1[28],ymm10[29],ymm1[29],ymm10[30],ymm1[30],ymm10[31],ymm1[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm13[8],ymm8[8],ymm13[9],ymm8[9],ymm13[10],ymm8[10],ymm13[11],ymm8[11],ymm13[12],ymm8[12],ymm13[13],ymm8[13],ymm13[14],ymm8[14],ymm13[15],ymm8[15],ymm13[24],ymm8[24],ymm13[25],ymm8[25],ymm13[26],ymm8[26],ymm13[27],ymm8[27],ymm13[28],ymm8[28],ymm13[29],ymm8[29],ymm13[30],ymm8[30],ymm13[31],ymm8[31] +; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm24 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm15[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm19, %ymm15 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm10 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm28 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-SLOW-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm30 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm12 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm12 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512F-SLOW-NEXT: vprold $16, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm31 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm13[0],ymm8[0],ymm13[1],ymm8[1],ymm13[2],ymm8[2],ymm13[3],ymm8[3],ymm13[4],ymm8[4],ymm13[5],ymm8[5],ymm13[6],ymm8[6],ymm13[7],ymm8[7],ymm13[16],ymm8[16],ymm13[17],ymm8[17],ymm13[18],ymm8[18],ymm13[19],ymm8[19],ymm13[20],ymm8[20],ymm13[21],ymm8[21],ymm13[22],ymm8[22],ymm13[23],ymm8[23] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm8 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: # xmm8 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[16],ymm1[16],ymm10[17],ymm1[17],ymm10[18],ymm1[18],ymm10[19],ymm1[19],ymm10[20],ymm1[20],ymm10[21],ymm1[21],ymm10[22],ymm1[22],ymm10[23],ymm1[23] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm14, %ymm14 +; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm13 +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm15 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm30[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm16 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm30, %zmm16 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm16, %zmm17, %zmm6 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm16 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm6, %zmm21, %zmm16 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm18, %ymm6 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm10, %zmm8 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm3, %ymm9 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm9[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm30[0,0,0,1] -; AVX512F-SLOW-NEXT: vprold $16, %ymm31, %ymm9 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,0,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512F-SLOW-NEXT: vprold $16, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm20[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm17[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm11[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm16[0,0,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm9 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm9, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm15, %ymm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm9, %ymm3, %ymm13 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm13[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm8, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm9, %zmm12 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm8, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm30 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm8, %ymm15, %ymm31 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm15, %ymm11 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm9, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm11[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm9, %zmm5 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm31[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm13, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm12 = mem[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm12 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm13 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm10, %zmm13 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm12, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm6 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm13, %ymm1 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm1, %ymm3, %ymm4 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm12, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vpternlogq $184, %ymm13, %ymm9, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm25[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm30, %zmm11 +; AVX512F-SLOW-NEXT: vpermq $64, (%rsp), %zmm3 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm17, %zmm3 +; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm21, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,2,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm3, %zmm17, %zmm2 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm22[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm19[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm17, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm20[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm30, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm26[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm22[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm2 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm27[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm24[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm17, %zmm4 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm28[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm30, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm29[2,2,2,3,6,6,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-SLOW-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride6_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $200, %rsp -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm1 +; AVX512F-FAST-NEXT: subq $424, %rsp # imm = 0x1A8 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm8 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm30 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15],ymm6[24],ymm4[24],ymm6[25],ymm4[25],ymm6[26],ymm4[26],ymm6[27],ymm4[27],ymm6[28],ymm4[28],ymm6[29],ymm4[29],ymm6[30],ymm4[30],ymm6[31],ymm4[31] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm14, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm15[8],ymm14[8],ymm15[9],ymm14[9],ymm15[10],ymm14[10],ymm15[11],ymm14[11],ymm15[12],ymm14[12],ymm15[13],ymm14[13],ymm15[14],ymm14[14],ymm15[15],ymm14[15],ymm15[24],ymm14[24],ymm15[25],ymm14[25],ymm15[26],ymm14[26],ymm15[27],ymm14[27],ymm15[28],ymm14[28],ymm15[29],ymm14[29],ymm15[30],ymm14[30],ymm15[31],ymm14[31] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm15[8],ymm3[8],ymm15[9],ymm3[9],ymm15[10],ymm3[10],ymm15[11],ymm3[11],ymm15[12],ymm3[12],ymm15[13],ymm3[13],ymm15[14],ymm3[14],ymm15[15],ymm3[15],ymm15[24],ymm3[24],ymm15[25],ymm3[25],ymm15[26],ymm3[26],ymm15[27],ymm3[27],ymm15[28],ymm3[28],ymm15[29],ymm3[29],ymm15[30],ymm3[30],ymm15[31],ymm3[31] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[16],ymm2[16],ymm9[17],ymm2[17],ymm9[18],ymm2[18],ymm9[19],ymm2[19],ymm9[20],ymm2[20],ymm9[21],ymm2[21],ymm9[22],ymm2[22],ymm9[23],ymm2[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm10[8],ymm4[8],ymm10[9],ymm4[9],ymm10[10],ymm4[10],ymm10[11],ymm4[11],ymm10[12],ymm4[12],ymm10[13],ymm4[13],ymm10[14],ymm4[14],ymm10[15],ymm4[15],ymm10[24],ymm4[24],ymm10[25],ymm4[25],ymm10[26],ymm4[26],ymm10[27],ymm4[27],ymm10[28],ymm4[28],ymm10[29],ymm4[29],ymm10[30],ymm4[30],ymm10[31],ymm4[31] -; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm6, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm12 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm12, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm22 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm15[0],ymm6[0],ymm15[1],ymm6[1],ymm15[2],ymm6[2],ymm15[3],ymm6[3],ymm15[4],ymm6[4],ymm15[5],ymm6[5],ymm15[6],ymm6[6],ymm15[7],ymm6[7],ymm15[16],ymm6[16],ymm15[17],ymm6[17],ymm15[18],ymm6[18],ymm15[19],ymm6[19],ymm15[20],ymm6[20],ymm15[21],ymm6[21],ymm15[22],ymm6[22],ymm15[23],ymm6[23] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[4],ymm0[4],ymm9[5],ymm0[5],ymm9[6],ymm0[6],ymm9[7],ymm0[7],ymm9[16],ymm0[16],ymm9[17],ymm0[17],ymm9[18],ymm0[18],ymm9[19],ymm0[19],ymm9[20],ymm0[20],ymm9[21],ymm0[21],ymm9[22],ymm0[22],ymm9[23],ymm0[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm26 +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[16],ymm1[16],ymm10[17],ymm1[17],ymm10[18],ymm1[18],ymm10[19],ymm1[19],ymm10[20],ymm1[20],ymm10[21],ymm1[21],ymm10[22],ymm1[22],ymm10[23],ymm1[23] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm9[8],ymm4[9],ymm9[9],ymm4[10],ymm9[10],ymm4[11],ymm9[11],ymm4[12],ymm9[12],ymm4[13],ymm9[13],ymm4[14],ymm9[14],ymm4[15],ymm9[15],ymm4[24],ymm9[24],ymm4[25],ymm9[25],ymm4[26],ymm9[26],ymm4[27],ymm9[27],ymm4[28],ymm9[28],ymm4[29],ymm9[29],ymm4[30],ymm9[30],ymm4[31],ymm9[31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,26,u,29,u,28,u,27,u,30,u,u,u,u,u,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm29 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm15[0],ymm14[0],ymm15[1],ymm14[1],ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[4],ymm14[4],ymm15[5],ymm14[5],ymm15[6],ymm14[6],ymm15[7],ymm14[7],ymm15[16],ymm14[16],ymm15[17],ymm14[17],ymm15[18],ymm14[18],ymm15[19],ymm14[19],ymm15[20],ymm14[20],ymm15[21],ymm14[21],ymm15[22],ymm14[22],ymm15[23],ymm14[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm6[0],ymm0[1],ymm6[1],ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[4],ymm6[4],ymm0[5],ymm6[5],ymm0[6],ymm6[6],ymm0[7],ymm6[7],ymm0[16],ymm6[16],ymm0[17],ymm6[17],ymm0[18],ymm6[18],ymm0[19],ymm6[19],ymm0[20],ymm6[20],ymm0[21],ymm6[21],ymm0[22],ymm6[22],ymm0[23],ymm6[23] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-FAST-NEXT: vprold $16, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm31 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[4],ymm9[4],ymm4[5],ymm9[5],ymm4[6],ymm9[6],ymm4[7],ymm9[7],ymm4[16],ymm9[16],ymm4[17],ymm9[17],ymm4[18],ymm9[18],ymm4[19],ymm9[19],ymm4[20],ymm9[20],ymm4[21],ymm9[21],ymm4[22],ymm9[22],ymm4[23],ymm9[23] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm15, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512F-FAST-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm20, %ymm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm17[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm21, %ymm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm14[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm11[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm31[0,0,0,1] -; AVX512F-FAST-NEXT: vprold $16, %ymm28, %ymm10 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm31[0,0,0,1] +; AVX512F-FAST-NEXT: vprold $16, %ymm19, %ymm14 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm18[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm20[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm16[0,0,0,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm29[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm19[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm29 = ymm30[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm17[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm5, %ymm13, %ymm11 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm8, %zmm5 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm10 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm9, %zmm10 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm30 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm10, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm8, %ymm2, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm15[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm8 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm8 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm10, %ymm2, %ymm31 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm31[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm24[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm9, %zmm6 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm26[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm25[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm27[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm21[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm10, %zmm11 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm9, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm11, %ymm4 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm4, %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm9, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm23[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm22[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm2 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm7, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm14[2,2,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm15, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm31, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm6, %zmm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm6, %zmm2 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm3 +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm6, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm23[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm25[0,0,0,1,4,4,4,5] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm7 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm7 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm27[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm8 +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm6, %zmm4 +; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm6 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm29[2,2,2,3,6,6,6,7] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm7, %zmm0 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-FAST-NEXT: addq $200, %rsp +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512F-FAST-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512BW-ONLY-SLOW: # %bb.0: ; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm6, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[16],ymm9[16],ymm11[17],ymm9[17],ymm11[18],ymm9[18],ymm11[19],ymm9[19],ymm11[20],ymm9[20],ymm11[21],ymm9[21],ymm11[22],ymm9[22],ymm11[23],ymm9[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm16[0],ymm14[0],ymm16[1],ymm14[1],ymm16[2],ymm14[2],ymm16[3],ymm14[3],ymm16[4],ymm14[4],ymm16[5],ymm14[5],ymm16[6],ymm14[6],ymm16[7],ymm14[7],ymm16[16],ymm14[16],ymm16[17],ymm14[17],ymm16[18],ymm14[18],ymm16[19],ymm14[19],ymm16[20],ymm14[20],ymm16[21],ymm14[21],ymm16[22],ymm14[22],ymm16[23],ymm14[23] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm0, %ymm15, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512BW-ONLY-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm23, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm29, %ymm22, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm30, %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm12, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm30, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm3, %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm15, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm29, %ymm17, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm26, %zmm7 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm19, %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm25, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm19, %ymm25, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm20, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm21, %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm20, %xmm20 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[0,0,0,1,4,4,4,5] ; AVX512BW-ONLY-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 -; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm23, %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm24, %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm24, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,0,0,1,4,4,4,5] ; AVX512BW-ONLY-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm24, %zmm19 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm14, %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm16[8],ymm14[8],ymm16[9],ymm14[9],ymm16[10],ymm14[10],ymm16[11],ymm14[11],ymm16[12],ymm14[12],ymm16[13],ymm14[13],ymm16[14],ymm14[14],ymm16[15],ymm14[15],ymm16[24],ymm14[24],ymm16[25],ymm14[25],ymm16[26],ymm14[26],ymm16[27],ymm14[27],ymm16[28],ymm14[28],ymm16[29],ymm14[29],ymm16[30],ymm14[30],ymm16[31],ymm14[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm14, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm29, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm9, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm11, %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[4],ymm30[4],ymm31[5],ymm30[5],ymm31[6],ymm30[6],ymm31[7],ymm30[7],ymm31[16],ymm30[16],ymm31[17],ymm30[17],ymm31[18],ymm30[18],ymm31[19],ymm30[19],ymm31[20],ymm30[20],ymm31[21],ymm30[21],ymm31[22],ymm30[22],ymm31[23],ymm30[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15],ymm11[24],ymm9[24],ymm11[25],ymm9[25],ymm11[26],ymm9[26],ymm11[27],ymm9[27],ymm11[28],ymm9[28],ymm11[29],ymm9[29],ymm11[30],ymm9[30],ymm11[31],ymm9[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm29, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm29, %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm18, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm22, %zmm9 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm4, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm26, %xmm5, %xmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm4, %ymm25, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm1, %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm27, %xmm2, %xmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm5[0],xmm22[1],xmm5[1],xmm22[2],xmm5[2],xmm22[3],xmm5[3],xmm22[4],xmm5[4],xmm22[5],xmm5[5],xmm22[6],xmm5[6],xmm22[7],xmm5[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm3, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm3, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm23, %xmm6, %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm12, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm13, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm2, %ymm14, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm8, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm10, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15],ymm10[24],ymm8[24],ymm10[25],ymm8[25],ymm10[26],ymm8[26],ymm10[27],ymm8[27],ymm10[28],ymm8[28],ymm10[29],ymm8[29],ymm10[30],ymm8[30],ymm10[31],ymm8[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm3, %ymm11, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm29, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm18, %zmm17, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-ONLY-SLOW-NEXT: vzeroupper @@ -4857,414 +4782,415 @@ ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm4, %ymm14, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm3[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] +; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm15, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm8, %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm9, %xmm7 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-FAST-NEXT: vpermw %ymm7, %ymm21, %ymm7 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 ; AVX512BW-FAST-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm4[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm10, %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm23 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm10, %xmm7 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,0,0,1,4,4,4,5] +; AVX512BW-FAST-NEXT: movl $613566756, %eax # imm = 0x24924924 +; AVX512BW-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = +; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm4, %xmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm25 = +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm4, %xmm11 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm11, %zmm7 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] +; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm0 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm17[0],ymm7[1],ymm17[1],ymm7[2],ymm17[2],ymm7[3],ymm17[3],ymm7[4],ymm17[4],ymm7[5],ymm17[5],ymm7[6],ymm17[6],ymm7[7],ymm17[7],ymm7[16],ymm17[16],ymm7[17],ymm17[17],ymm7[18],ymm17[18],ymm7[19],ymm17[19],ymm7[20],ymm17[20],ymm7[21],ymm17[21],ymm7[22],ymm17[22],ymm7[23],ymm17[23] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm7, %zmm0 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm13, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm14, %ymm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[2],ymm7[2],ymm11[3],ymm7[3],ymm11[4],ymm7[4],ymm11[5],ymm7[5],ymm11[6],ymm7[6],ymm11[7],ymm7[7],ymm11[16],ymm7[16],ymm11[17],ymm7[17],ymm11[18],ymm7[18],ymm11[19],ymm7[19],ymm11[20],ymm7[20],ymm11[21],ymm7[21],ymm11[22],ymm7[22],ymm11[23],ymm7[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm17 = ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15],ymm13[24],ymm11[24],ymm13[25],ymm11[25],ymm13[26],ymm11[26],ymm13[27],ymm11[27],ymm13[28],ymm11[28],ymm13[29],ymm11[29],ymm13[30],ymm11[30],ymm13[31],ymm11[31] -; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm17, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm18, %ymm12 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[16],ymm7[16],ymm12[17],ymm7[17],ymm12[18],ymm7[18],ymm12[19],ymm7[19],ymm12[20],ymm7[20],ymm12[21],ymm7[21],ymm12[22],ymm7[22],ymm12[23],ymm7[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm11, %ymm28, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %ymm19 +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm19, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %ymm20 +; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm20, %ymm16 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm16[0],ymm7[0],ymm16[1],ymm7[1],ymm16[2],ymm7[2],ymm16[3],ymm7[3],ymm16[4],ymm7[4],ymm16[5],ymm7[5],ymm16[6],ymm7[6],ymm16[7],ymm7[7],ymm16[16],ymm7[16],ymm16[17],ymm7[17],ymm16[18],ymm7[18],ymm16[19],ymm7[19],ymm16[20],ymm7[20],ymm16[21],ymm7[21],ymm16[22],ymm7[22],ymm16[23],ymm7[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm18[8],ymm17[8],ymm18[9],ymm17[9],ymm18[10],ymm17[10],ymm18[11],ymm17[11],ymm18[12],ymm17[12],ymm18[13],ymm17[13],ymm18[14],ymm17[14],ymm18[15],ymm17[15],ymm18[24],ymm17[24],ymm18[25],ymm17[25],ymm18[26],ymm17[26],ymm18[27],ymm17[27],ymm18[28],ymm17[28],ymm18[29],ymm17[29],ymm18[30],ymm17[30],ymm18[31],ymm17[31] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm14, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm9, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb %zmm16, %zmm10, %zmm10 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,2,2,3,6,6,6,7] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm21 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm14, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6] -; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm16 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm19, %xmm20 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm20[8],xmm10[8],xmm20[9],xmm10[9],xmm20[10],xmm10[10],xmm20[11],xmm10[11],xmm20[12],xmm10[12],xmm20[13],xmm10[13],xmm20[14],xmm10[14],xmm20[15],xmm10[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm20[8],ymm19[8],ymm20[9],ymm19[9],ymm20[10],ymm19[10],ymm20[11],ymm19[11],ymm20[12],ymm19[12],ymm20[13],ymm19[13],ymm20[14],ymm19[14],ymm20[15],ymm19[15],ymm20[24],ymm19[24],ymm20[25],ymm19[25],ymm20[26],ymm19[26],ymm20[27],ymm19[27],ymm20[28],ymm19[28],ymm20[29],ymm19[29],ymm20[30],ymm19[30],ymm20[31],ymm19[31] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm29, %ymm16 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm7 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-FAST-NEXT: vpshufb %zmm30, %zmm18, %zmm11 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm7 {%k2} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-FAST-NEXT: vpshufb %zmm31, %zmm11, %zmm16 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 +; AVX512BW-FAST-NEXT: kmovq %rax, %k4 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm7 {%k4} +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm16 +; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3],xmm12[4],xmm16[4],xmm12[5],xmm16[5],xmm12[6],xmm16[6],xmm12[7],xmm16[7] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-FAST-NEXT: vpermw %ymm16, %ymm15, %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm16 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm3, %xmm12 +; AVX512BW-FAST-NEXT: vpshufb %xmm17, %xmm16, %xmm17 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm17[8],xmm12[8],xmm17[9],xmm12[9],xmm17[10],xmm12[10],xmm17[11],xmm12[11],xmm17[12],xmm12[12],xmm17[13],xmm12[13],xmm17[14],xmm12[14],xmm17[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm16[0],xmm3[0],xmm16[1],xmm3[1],xmm16[2],xmm3[2],xmm16[3],xmm3[3],xmm16[4],xmm3[4],xmm16[5],xmm3[5],xmm16[6],xmm3[6],xmm16[7],xmm3[7] +; AVX512BW-FAST-NEXT: vpermw %ymm17, %ymm21, %ymm17 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm15, %zmm12 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512BW-FAST-NEXT: vpshufb %xmm22, %xmm17, %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm17, %xmm22 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm22, %zmm21 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[0,0,0,1,4,4,4,5] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm12 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm15, %xmm21 +; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm22 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm22, %zmm21 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm21 = zmm21[0,0,0,1,4,4,4,5] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm12 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm22 +; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm22, %ymm21 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm23 +; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm23, %ymm24 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm21 = ymm24[0],ymm21[0],ymm24[1],ymm21[1],ymm24[2],ymm21[2],ymm24[3],ymm21[3],ymm24[4],ymm21[4],ymm24[5],ymm21[5],ymm24[6],ymm21[6],ymm24[7],ymm21[7],ymm24[16],ymm21[16],ymm24[17],ymm21[17],ymm24[18],ymm21[18],ymm24[19],ymm21[19],ymm24[20],ymm21[20],ymm24[21],ymm21[21],ymm24[22],ymm21[22],ymm24[23],ymm21[23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm24 = ymm23[8],ymm22[8],ymm23[9],ymm22[9],ymm23[10],ymm22[10],ymm23[11],ymm22[11],ymm23[12],ymm22[12],ymm23[13],ymm22[13],ymm23[14],ymm22[14],ymm23[15],ymm22[15],ymm23[24],ymm22[24],ymm23[25],ymm22[25],ymm23[26],ymm22[26],ymm23[27],ymm22[27],ymm23[28],ymm22[28],ymm23[29],ymm22[29],ymm23[30],ymm22[30],ymm23[31],ymm22[31] +; AVX512BW-FAST-NEXT: vpermw %ymm24, %ymm28, %ymm24 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm24 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm26 +; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm26, %ymm21 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm28 +; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm28, %ymm25 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm21 = ymm25[0],ymm21[0],ymm25[1],ymm21[1],ymm25[2],ymm21[2],ymm25[3],ymm21[3],ymm25[4],ymm21[4],ymm25[5],ymm21[5],ymm25[6],ymm21[6],ymm25[7],ymm21[7],ymm25[16],ymm21[16],ymm25[17],ymm21[17],ymm25[18],ymm21[18],ymm25[19],ymm21[19],ymm25[20],ymm21[20],ymm25[21],ymm21[21],ymm25[22],ymm21[22],ymm25[23],ymm21[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm28[8],ymm26[8],ymm28[9],ymm26[9],ymm28[10],ymm26[10],ymm28[11],ymm26[11],ymm28[12],ymm26[12],ymm28[13],ymm26[13],ymm28[14],ymm26[14],ymm28[15],ymm26[15],ymm28[24],ymm26[24],ymm28[25],ymm26[25],ymm28[26],ymm26[26],ymm28[27],ymm26[27],ymm28[28],ymm26[28],ymm28[29],ymm26[29],ymm28[30],ymm26[30],ymm28[31],ymm26[31] +; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm29, %ymm25 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm21, %zmm21 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpshufb %zmm30, %zmm25, %zmm24 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm24 = zmm24[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm21 {%k2} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpshufb %zmm31, %zmm24, %zmm27 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm27 = zmm27[2,2,2,3,6,6,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm27, %zmm21 {%k4} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm20[0],ymm19[0],ymm20[1],ymm19[1],ymm20[2],ymm19[2],ymm20[3],ymm19[3],ymm20[4],ymm19[4],ymm20[5],ymm19[5],ymm20[6],ymm19[6],ymm20[7],ymm19[7],ymm20[16],ymm19[16],ymm20[17],ymm19[17],ymm20[18],ymm19[18],ymm20[19],ymm19[19],ymm20[20],ymm19[20],ymm20[21],ymm19[21],ymm20[22],ymm19[22],ymm20[23],ymm19[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-FAST-NEXT: vpermw %zmm6, %zmm8, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm9, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-FAST-NEXT: vpermw %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm20, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm16, %xmm20 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm16[0],zero,xmm16[1],zero,xmm16[2],zero,xmm16[3],zero,xmm16[4],zero,xmm16[5],zero,xmm16[6],zero,xmm16[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm28, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-FAST-NEXT: vpermw %zmm20, %zmm28, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm20 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm20, %xmm30 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm20[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm31, %zmm30 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm31 -; AVX512BW-FAST-NEXT: vpermw %zmm30, %zmm28, %zmm30 -; AVX512BW-FAST-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm30, %zmm10 {%k3} -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm21, %xmm30 -; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm22, %xmm23 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm23[0],xmm30[0],xmm23[1],xmm30[1],xmm23[2],xmm30[2],xmm23[3],xmm30[3],xmm23[4],xmm30[4],xmm23[5],xmm30[5],xmm23[6],xmm30[6],xmm23[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpermw %ymm30, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm30 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm31, %xmm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm30, %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm23[8],xmm25[9],xmm23[9],xmm25[10],xmm23[10],xmm25[11],xmm23[11],xmm25[12],xmm23[12],xmm25[13],xmm23[13],xmm25[14],xmm23[14],xmm25[15],xmm23[15] -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm31[0],xmm30[1],xmm31[1],xmm30[2],xmm31[2],xmm30[3],xmm31[3],xmm30[4],xmm31[4],xmm30[5],xmm31[5],xmm30[6],xmm31[6],xmm30[7],xmm31[7] -; AVX512BW-FAST-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm25, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm23 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm25, %xmm24 -; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm26 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm26, %xmm24 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm26[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm27, %zmm24 -; AVX512BW-FAST-NEXT: vpermw %zmm24, %zmm28, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm24, %zmm23 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm17 = ymm18[0],ymm17[0],ymm18[1],ymm17[1],ymm18[2],ymm17[2],ymm18[3],ymm17[3],ymm18[4],ymm17[4],ymm18[5],ymm17[5],ymm18[6],ymm17[6],ymm18[7],ymm17[7],ymm18[16],ymm17[16],ymm18[17],ymm17[17],ymm18[18],ymm17[18],ymm18[19],ymm17[19],ymm18[20],ymm17[20],ymm18[21],ymm17[21],ymm18[22],ymm17[22],ymm18[23],ymm17[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[16],ymm11[16],ymm13[17],ymm11[17],ymm13[18],ymm11[18],ymm13[19],ymm11[19],ymm13[20],ymm11[20],ymm13[21],ymm11[21],ymm13[22],ymm11[22],ymm13[23],ymm11[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm30[8],xmm31[8],xmm30[9],xmm31[9],xmm30[10],xmm31[10],xmm30[11],xmm31[11],xmm30[12],xmm31[12],xmm30[13],xmm31[13],xmm30[14],xmm31[14],xmm30[15],xmm31[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm13, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,25,24,27,26,25,24,27,26,25,24,27,26,29,28,31,30] -; AVX512BW-FAST-NEXT: vpermw %zmm17, %zmm18, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm13, %zmm1 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm18, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm18, %ymm14 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 ; AVX512BW-FAST-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm11 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm5, %ymm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512BW-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm10, %zmm6 {%k2} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm28[0],ymm26[0],ymm28[1],ymm26[1],ymm28[2],ymm26[2],ymm28[3],ymm26[3],ymm28[4],ymm26[4],ymm28[5],ymm26[5],ymm28[6],ymm26[6],ymm28[7],ymm26[7],ymm28[16],ymm26[16],ymm28[17],ymm26[17],ymm28[18],ymm26[18],ymm28[19],ymm26[19],ymm28[20],ymm26[20],ymm28[21],ymm26[21],ymm28[22],ymm26[22],ymm28[23],ymm26[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm23[0],ymm22[0],ymm23[1],ymm22[1],ymm23[2],ymm22[2],ymm23[3],ymm22[3],ymm23[4],ymm22[4],ymm23[5],ymm22[5],ymm23[6],ymm22[6],ymm23[7],ymm22[7],ymm23[16],ymm22[16],ymm23[17],ymm22[17],ymm23[18],ymm22[18],ymm23[19],ymm22[19],ymm23[20],ymm22[20],ymm23[21],ymm22[21],ymm23[22],ymm22[22],ymm23[23],ymm22[23] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8],xmm3[8],xmm16[9],xmm3[9],xmm16[10],xmm3[10],xmm16[11],xmm3[11],xmm16[12],xmm3[12],xmm16[13],xmm3[13],xmm16[14],xmm3[14],xmm16[15],xmm3[15] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm3 +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm8, %zmm2 +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm9, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm8 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm17, %xmm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm25, %ymm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm24, %ymm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm1 {%k2} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq ; ; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64: ; AVX512DQBW-SLOW: # %bb.0: ; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm6, %ymm6 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[16],ymm9[16],ymm11[17],ymm9[17],ymm11[18],ymm9[18],ymm11[19],ymm9[19],ymm11[20],ymm9[20],ymm11[21],ymm9[21],ymm11[22],ymm9[22],ymm11[23],ymm9[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm16[0],ymm14[0],ymm16[1],ymm14[1],ymm16[2],ymm14[2],ymm16[3],ymm14[3],ymm16[4],ymm14[4],ymm16[5],ymm14[5],ymm16[6],ymm14[6],ymm16[7],ymm14[7],ymm16[16],ymm14[16],ymm16[17],ymm14[17],ymm16[18],ymm14[18],ymm16[19],ymm14[19],ymm16[20],ymm14[20],ymm16[21],ymm14[21],ymm16[22],ymm14[22],ymm16[23],ymm14[23] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [8,11,10,9,8,11,10,9,8,11,10,9,12,13,14,13] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm0, %ymm15, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512DQBW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm23, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512DQBW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 ; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 ; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} -; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQBW-SLOW-NEXT: # ymm26 = mem[0,1,0,1] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm29 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm29 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm29, %ymm22, %ymm10 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQBW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] -; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm30, %ymm30 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm7 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm12, %ymm12 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm30, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm3, %xmm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm15, %ymm26 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm17[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm29, %ymm17, %ymm27 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm26, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm26, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm26 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm19, %xmm27 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm25, %xmm28 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm19, %ymm25, %ymm19 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm20, %xmm28 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm21, %xmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm20, %xmm20 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm20 = zmm20[0,0,0,1,4,4,4,5] ; AVX512DQBW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 ; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} -; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 -; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 -; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm23, %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm28, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm24, %xmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm29, %zmm24, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,0,0,1,4,4,4,5] ; AVX512DQBW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 ; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm19 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm14, %ymm29 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm29 = ymm30[0],ymm29[0],ymm30[1],ymm29[1],ymm30[2],ymm29[2],ymm30[3],ymm29[3],ymm30[4],ymm29[4],ymm30[5],ymm29[5],ymm30[6],ymm29[6],ymm30[7],ymm29[7],ymm30[16],ymm29[16],ymm30[17],ymm29[17],ymm30[18],ymm29[18],ymm30[19],ymm29[19],ymm30[20],ymm29[20],ymm30[21],ymm29[21],ymm30[22],ymm29[22],ymm30[23],ymm29[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm16[8],ymm14[8],ymm16[9],ymm14[9],ymm16[10],ymm14[10],ymm16[11],ymm14[11],ymm16[12],ymm14[12],ymm16[13],ymm14[13],ymm16[14],ymm14[14],ymm16[15],ymm14[15],ymm16[24],ymm14[24],ymm16[25],ymm14[25],ymm16[26],ymm14[26],ymm16[27],ymm14[27],ymm16[28],ymm14[28],ymm16[29],ymm14[29],ymm16[30],ymm14[30],ymm16[31],ymm14[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm14, %ymm16 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm29, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm9, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm11, %ymm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm31[0],ymm30[0],ymm31[1],ymm30[1],ymm31[2],ymm30[2],ymm31[3],ymm30[3],ymm31[4],ymm30[4],ymm31[5],ymm30[5],ymm31[6],ymm30[6],ymm31[7],ymm30[7],ymm31[16],ymm30[16],ymm31[17],ymm30[17],ymm31[18],ymm30[18],ymm31[19],ymm30[19],ymm31[20],ymm30[20],ymm31[21],ymm30[21],ymm31[22],ymm30[22],ymm31[23],ymm30[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15],ymm11[24],ymm9[24],ymm11[25],ymm9[25],ymm11[26],ymm9[26],ymm11[27],ymm9[27],ymm11[28],ymm9[28],ymm11[29],ymm9[29],ymm11[30],ymm9[30],ymm11[31],ymm9[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm29, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm29, %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm18, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm9 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm4, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm26, %xmm5, %xmm26 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm4, %ymm25, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm1, %xmm5 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm27, %xmm2, %xmm22 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm5[0],xmm22[1],xmm5[1],xmm22[2],xmm5[2],xmm22[3],xmm5[3],xmm22[4],xmm5[4],xmm22[5],xmm5[5],xmm22[6],xmm5[6],xmm22[7],xmm5[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm3, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm3, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm23, %xmm6, %xmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm12, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm13, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm2, %ymm14, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm8, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm10, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11],ymm10[12],ymm8[12],ymm10[13],ymm8[13],ymm10[14],ymm8[14],ymm10[15],ymm8[15],ymm10[24],ymm8[24],ymm10[25],ymm8[25],ymm10[26],ymm8[26],ymm10[27],ymm8[27],ymm10[28],ymm8[28],ymm10[29],ymm8[29],ymm10[30],ymm8[30],ymm10[31],ymm8[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm3, %ymm11, %ymm3 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} -; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 -; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm29, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm18, %zmm17, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm2 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQBW-SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -23,11 +23,11 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7] @@ -62,12 +62,12 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa (%rdx), %xmm1 ; AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,2,6,10,14,3,7,11,u,u] ; AVX-NEXT: vpextrw $6, %xmm0, 12(%rax) ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rax) @@ -98,113 +98,136 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: movdqa (%r8), %xmm5 -; SSE-NEXT: movdqa (%r10), %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm7 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm8 +; SSE-NEXT: movdqa (%r10), %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,1,3] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm5, %xmm10 +; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,0,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm2[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,1,0] +; SSE-NEXT: pandn %xmm12, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,2,1] +; SSE-NEXT: pandn %xmm13, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm11, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,0,0,0] +; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: pandn %xmm11, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] -; SSE-NEXT: pandn %xmm9, %xmm7 -; SSE-NEXT: por %xmm8, %xmm7 -; SSE-NEXT: pandn %xmm7, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,1] -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm6 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; SSE-NEXT: pandn %xmm8, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movq %xmm3, 16(%rax) -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pand %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: psllq $24, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movq %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm5, (%rax) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movd %xmm0, 24(%rax) -; SSE-NEXT: movdqa %xmm4, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf4: ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,4,8,12],zero,zero,zero,xmm0[1,5,9,13],zero,zero,zero,xmm0[2,6] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[0,4,8],zero,zero,zero,zero,xmm1[1,5,9],zero,zero +; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm0 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX1-ONLY-NEXT: vbroadcastss (%r10), %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0,2] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,4,8,12],zero,zero,zero,xmm1[1,5,9,13],zero,zero,zero,xmm1[2,6] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm0[0,4,8],zero,zero,zero,zero,xmm0[1,5,9],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,14],zero,zero,zero,xmm0[3,7,11,15],zero,zero,zero,xmm0[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[2,6,10],zero,zero,zero,zero,xmm1[3,7,11,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,14],zero,zero,zero,xmm1[3,7,11,15],zero,zero,zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,6,10],zero,zero,zero,zero,xmm0[3,7,11,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpextrd $2, %xmm0, 24(%rax) ; AVX1-ONLY-NEXT: vmovq %xmm0, 16(%rax) ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-ONLY-LABEL: store_i8_stride7_vf4: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -221,14 +244,17 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512F-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512F-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-NEXT: vmovdqa (%r8), %xmm1 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27,u,u,u,u] ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -245,14 +271,17 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512BW-NEXT: vpbroadcastd (%r10), %ymm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,4,8,12],zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6],zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -452,94 +481,106 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u],zero,zero,xmm2[5,13,u,u,u],zero,zero,xmm2[6,14,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,5,13],zero,zero,xmm1[u,u,u,6,14],zero,zero,xmm1[u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,12],zero,xmm0[u,u,u,u,5,13],zero,xmm0[u,u,u,u,6,14] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u],zero,zero,xmm2[7,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,7,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] -; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,0,8],zero,xmm0[u,u,u,u,1,9],zero,xmm0[u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u],zero,zero,xmm3[0,u,u,u,u],zero,zero,xmm3[1,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm6[6,7],zero,zero,zero,zero,zero,xmm6[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[4,5],zero,zero,zero,zero,zero,xmm8[6,7],zero,zero,zero,zero,zero,xmm8[8,9] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,1],zero,zero,zero,zero,zero,xmm8[2,3],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1],zero,zero,zero,zero,zero,xmm6[2,3],zero,zero,zero,zero,zero,xmm6[4,5] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,4,5],zero,xmm4[u,u,u,u,6,7],zero,xmm4[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u],zero,zero,xmm5[2,u,u,u,u],zero,zero,xmm5[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[u,u,u,3,11],zero,zero,xmm1[u,u,u,4,12],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,10,u,u,u],zero,zero,xmm2[3,11,u,u,u],zero,zero,xmm2[4,12] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,10],zero,xmm0[u,u,u,u,3,11],zero,xmm0[u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, 16(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rax) -; AVX1-ONLY-NEXT: vmovq %xmm5, 48(%rax) -; AVX1-ONLY-NEXT: vmovdqa %xmm4, 32(%rax) +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[4,5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[12,15,u,u,u,u,u,14,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpsrldq {{.*#+}} xmm7 = xmm4[9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [255,255,0,0,0,0,0,255,255,255,0,0,0,0,0,255] +; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[8,11],zero,zero,zero,zero,zero,xmm3[10,13],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,xmm4[12,13] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-ONLY-NEXT: vpsrlq $56, %xmm0, %xmm2 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13],zero,xmm1[u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,xmm0[5,u,u,u,u],zero,zero,xmm0[6,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vmovlps %xmm1, 48(%rax) +; AVX1-ONLY-NEXT: vmovaps %xmm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, (%rax) +; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-SLOW-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-SLOW-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[4,13],zero,zero,zero,zero,zero,ymm3[5,14],zero,zero,zero,zero,zero,ymm3[22,31],zero,zero,zero,zero,zero,ymm3[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -547,41 +588,47 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX2-FAST-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,3,2,3,0,1,0,1,u,u,u,u,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm8 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[4,13],zero,zero,zero,zero,zero,ymm2[5,14],zero,zero,zero,zero,zero,ymm2[22,31],zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -589,41 +636,47 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm2[5,13],zero,zero,zero,zero,zero,ymm2[6,14],zero,zero,zero,zero,zero,zero,zero,ymm2[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,3,2,3,0,1,0,1,u,u,u,u,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,zero,zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[19,27],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[4,13],zero,zero,zero,zero,zero,ymm2[5,14],zero,zero,zero,zero,zero,ymm2[22,31],zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,12],zero,zero,zero,zero,zero,ymm4[5,13],zero,zero,zero,zero,zero,ymm4[6,14],zero,zero,zero,zero,zero,ymm4[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,5,13,u,u,u,u,u,6,14,u,u,u,u,u,23,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,u,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 48(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovq %xmm1, 48(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -639,11 +692,11 @@ ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,5,13],zero,zero,ymm2[u,u,u,6,14],zero,zero,ymm2[u,u,u],zero,zero,ymm2[23,31,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -680,29 +733,29 @@ ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512F-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] -; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,4,8],zero,zero,zero,zero,ymm2[1,5,9],zero,zero,zero,zero,ymm2[18,22,26],zero,zero,zero,zero,ymm2[19,23,27],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm2[19,27,u,u,u],zero,zero,ymm2[20,28] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,8],zero,zero,ymm0[u,u,u,1,9],zero,zero,ymm0[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm0[19,27,u,u,u],zero,zero,ymm0[20,28] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,zero,zero,ymm3[u,u,u,19,27],zero,zero,ymm3[u,u,u,20,28],zero,zero -; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,ymm0[u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovq %xmm1, 48(%rax) @@ -722,19 +775,19 @@ ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm2 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[2,10,18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zero,zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,0,1,2,3,0,1] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 @@ -759,33 +812,33 @@ ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> -; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512BW-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermi2q %ymm4, %ymm0, %ymm1 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm0 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10,18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0,8],zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512BW-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512BW-FAST-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rax) -; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512BW-FAST-NEXT: vmovq %xmm0, 48(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -812,348 +865,345 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf16: ; SSE: # %bb.0: -; SSE-NEXT: subq $72, %rsp +; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm5 -; SSE-NEXT: movdqa (%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm4 -; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: movdqa (%rdx), %xmm4 +; SSE-NEXT: movdqa (%rcx), %xmm6 +; SSE-NEXT: movdqa (%r8), %xmm2 +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: movdqa (%rax), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] +; SSE-NEXT: movdqa %xmm5, %xmm12 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,3] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; SSE-NEXT: pand %xmm12, %xmm10 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa (%rax), %xmm3 -; SSE-NEXT: por %xmm10, %xmm12 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,7,7,7] -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm12 -; SSE-NEXT: por %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,0,3] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: por %xmm14, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,0] -; SSE-NEXT: movdqa %xmm5, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm12, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: por %xmm14, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm7 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,1,2,3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,1,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,0] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm3, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[1,1,2,3] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm14 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,3,3] -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2] +; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,2,2,3] +; SSE-NEXT: pand %xmm4, %xmm14 +; SSE-NEXT: por %xmm11, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm0 ; SSE-NEXT: por %xmm12, %xmm0 +; SSE-NEXT: pshufhw $254, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,1,3,2] +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: pandn %xmm14, %xmm7 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm13, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] +; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm6, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,3,3] +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,1] -; SSE-NEXT: movdqa %xmm13, %xmm12 -; SSE-NEXT: pandn %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm7, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm12 -; SSE-NEXT: por %xmm12, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] +; SSE-NEXT: movdqa %xmm15, %xmm10 +; SSE-NEXT: pandn %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] ; SSE-NEXT: pand %xmm14, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm14 +; SSE-NEXT: pandn %xmm13, %xmm14 ; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,1,3] +; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: # xmm6 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pandn %xmm0, %xmm13 -; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm9 -; SSE-NEXT: pandn %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 ; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,2,2] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: pand %xmm10, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm12, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2] -; SSE-NEXT: pshuflw $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,1] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,4] -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pand %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pandn %xmm2, %xmm15 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm0, %xmm13 +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm13 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm13, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: pshuflw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: pshuflw $234, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pandn %xmm2, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: por %xmm12, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm10, 32(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) +; SSE-NEXT: movdqa %xmm9, 16(%rax) ; SSE-NEXT: movdqa %xmm3, 64(%rax) ; SSE-NEXT: movdqa %xmm14, (%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 96(%rax) -; SSE-NEXT: addq $72, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf16: @@ -1167,25 +1217,24 @@ ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[4,5],zero,zero,zero,zero,zero,xmm10[6,7],zero,zero,zero,zero,zero,xmm10[8,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm10[0,1],zero,zero,zero,zero,zero,xmm10[2,3],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero,xmm9[4,5] -; AVX1-ONLY-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,4,5],zero,xmm8[u,u,u,u,6,7],zero,xmm8[u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm8[6,7],zero,zero,zero,zero,zero,xmm8[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[4,5],zero,zero,zero,zero,zero,xmm9[6,7],zero,zero,zero,zero,zero,xmm9[8,9] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,zero,zero,xmm9[2,3],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[0,1],zero,zero,zero,zero,zero,xmm8[2,3],zero,zero,zero,zero,zero,xmm8[4,5] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,4,5],zero,xmm10[u,u,u,u,6,7],zero,xmm10[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u],zero,zero,xmm1[2,u,u,u,u],zero,zero,xmm1[3,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm12, %ymm11 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm11, %ymm3 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,7],zero,xmm4[u,u,u,u,u,8],zero,xmm4[u,u,u,u,u,9] @@ -1195,24 +1244,24 @@ ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm10[10,11],zero,zero,zero,zero,zero,xmm10[12,13],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero,zero,zero,xmm9[12,13],zero,zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[10,11],zero,zero,zero,zero,zero,xmm8[12,13],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,7],zero,xmm0[u,u,u,u,u,8],zero,xmm0[u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm11[u,u,u,u,5,6],zero,xmm11[u,u,u,u,12,13],zero,xmm11[u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[6,u,u,u,u],zero,zero,xmm1[7,u,u,u,u],zero,zero,xmm1[8,u] ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9],zero,xmm8[u,u,u,u,10,11],zero,xmm8[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9],zero,xmm10[u,u,u,u,10,11],zero,xmm10[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm1[4,u,u,u,u],zero,zero,xmm1[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm10, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm8, %ymm9, %ymm8 +; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm9[8,9],zero,zero,zero,zero,zero,xmm9[10,11],zero,zero,zero ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1232,8 +1281,7 @@ ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u],zero,zero,xmm1[9,u,u,u,u],zero,zero,xmm1[10,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm6[12,13],zero,zero,zero,zero,zero,xmm6[14,15],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm9[12,13],zero,zero,zero,zero,zero,xmm9[14,15],zero,zero,zero,zero,zero @@ -1347,9 +1395,8 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = @@ -1376,10 +1423,8 @@ ; AVX2-FAST-NEXT: vpor %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -1415,73 +1460,73 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero,zero,zero,zero,zero,ymm9[25] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm8, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero,zero,ymm11[25] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4],zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,2,0,2] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,8],zero,zero,zero,zero,zero,ymm13[1,9],zero,zero,zero,zero,zero,ymm13[2,10],zero,zero,zero,zero,zero,ymm13[19,27],zero,zero,zero,zero,zero,ymm13[20,28],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[12,13],zero,zero,zero,zero,zero,xmm4[14,15],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9],zero,zero,zero,zero,zero,ymm8[2,10],zero,zero,zero,zero,zero,ymm8[3,19],zero,zero,zero,zero,zero,ymm8[28,20],zero,zero,zero,zero,zero,ymm8[29,21],zero +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm10, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[12,13],zero,zero,zero,zero,zero,xmm5[14,15],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm8[13,14,15,4,5],zero,zero,xmm8[14,15,14,15,12],zero,zero,xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,14,15,4,5],zero,zero,xmm2[14,15,14,15,12],zero,zero,xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -1491,77 +1536,76 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512F-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512F-SLOW-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512F-SLOW-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512F-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512F-SLOW-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512F-SLOW-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512F-SLOW-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512F-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -1569,70 +1613,65 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm5 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm6 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512F-FAST-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512F-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512F-FAST-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm8 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[3,1,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[1],zero,zero,ymm10[u,u,u,10,2],zero,zero,ymm10[u,u,u,11,3],zero,zero,ymm10[u,u,u,20,28],zero,zero,ymm10[u,u,u,21,29],zero,zero,ymm10[u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,3,1] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,ymm11[1,9,u,u,u],zero,zero,ymm11[2,10,u,u,u],zero,zero,ymm11[3,19,u,u,u],zero,zero,ymm11[28,20,u,u,u],zero,zero,ymm11[29,21,u] +; AVX512F-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,1,9],zero,ymm12[u,u,u,u,2,10],zero,ymm12[u,u,u,u,19,27],zero,ymm12[u,u,u,u,20,28],zero,ymm12[u,u,u,u,21] +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm12 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[12,13,u,u,u],zero,zero,xmm3[14,15,u,u,u] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,12,13],zero,zero,xmm1[u,u,u,14,15],zero,zero,xmm1[u,u,u] +; AVX512F-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10],zero,xmm2[u,u,u,u,13,12],zero,xmm2[u,u,u,u,15,14],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512F-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm12, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,5,2,6,1,5,2,6] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u],zero,zero,ymm5[1,5,u,u,u],zero,zero,ymm5[2,6,u,u,u],zero,zero,ymm5[19,23,u,u,u],zero,zero,ymm5[24,28,u,u,u],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm9[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,u,u,1,9],zero,zero,ymm5[u,u,u,2,10],zero,zero,ymm5[u,u,u,19,27],zero,zero,ymm5[u,u,u,20,28],zero,zero +; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-FAST-NEXT: vporq %zmm3, %zmm5, %zmm3 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm7[0,2,0,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8],zero,ymm5[u,u,u,u,1,9],zero,ymm5[u,u,u,u,18,26],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,4],zero,ymm4[u,u,u,u,1,5],zero,ymm4[u,u,u,u,2,6],zero,ymm4[u,u,u,u,19,23],zero,ymm4[u,u,u,u,24,28],zero,ymm4[u] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,0,4,4,5,4] +; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -1645,19 +1684,18 @@ ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm8 +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm9 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti128 $1, (%r9), %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, (%r10), %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm6, %ymm8 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm8[0,1,2,3,6,7,7,7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,3,2] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,6,7,7,7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,3,2] ; AVX512BW-SLOW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1} ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[12,13],zero,zero,zero,zero,zero,xmm2[14,15],zero,zero,zero,zero,zero ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] @@ -1665,11 +1703,11 @@ ; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-SLOW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm7, %xmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %xmm6, %xmm2 {%k1} ; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,1,3] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[1,3,1,3] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-SLOW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 @@ -1683,13 +1721,16 @@ ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k1} ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm6, %zmm8, %zmm4 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[4],zero,zero,zero,zero,zero,zero,ymm9[5],zero,zero,zero,zero,zero,zero,ymm9[6],zero,zero,zero,zero,zero,zero,zero,ymm9[23],zero,zero,zero,zero,zero,zero,ymm9[24],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[0,2,0,2] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 @@ -1730,66 +1771,66 @@ ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512BW-FAST-NEXT: vinserti128 $1, (%r9), %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, (%r10), %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm6 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm9 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm10, %ymm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm7[1,3,1,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FAST-NEXT: movl $67637280, %ecx # imm = 0x4081020 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm10, %ymm11 {%k1} +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm9[1,3,3,1] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm10[1,9],zero,zero,zero,zero,zero,ymm10[2,10],zero,zero,zero,zero,zero,ymm10[3,19],zero,zero,zero,zero,zero,ymm10[28,20],zero,zero,zero,zero,zero,ymm10[29,21],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm8[3,1,1,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[1],zero,zero,zero,zero,zero,ymm12[10,2],zero,zero,zero,zero,zero,ymm12[11,3],zero,zero,zero,zero,zero,ymm12[20,28],zero,zero,zero,zero,zero,ymm12[21,29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX512BW-FAST-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 +; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %ymm11, %ymm10 {%k1} ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[12,13],zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,xmm1[14,15],zero,zero,zero ; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero -; AVX512BW-FAST-NEXT: vextracti64x4 $1, %zmm4, %ymm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512BW-FAST-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm4[1,3,1,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512BW-FAST-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm6[1,3,3,1] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[3,1,1,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm7, %ymm1 -; AVX512BW-FAST-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 -; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,50,50,50,52,50,50,50,52,51,51,51,51,50,50,50,52] -; AVX512BW-FAST-NEXT: vpermi2w %zmm4, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm1 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,5,2,6,1,5,2,6] ; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm3 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] +; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm9[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zmm4[0,8],zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zero,zmm4[33,37],zero,zero,zero,zero,zero,zmm4[34,38],zero,zero,zero,zero,zero,zmm4[51,55],zero,zero,zero,zero,zero,zmm4[56,60],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 +; AVX512BW-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,2,0,2] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FAST-NEXT: vpermw %zmm4, %zmm5, %zmm4 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm4 {%k1} -; AVX512BW-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,2,0,2] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zmm2[33,37],zero,zero,zero,zero,zero,zmm2[34,38],zero,zero,zero,zero,zero,zmm2[51,55],zero,zero,zero,zero,zero,zmm2[56,60],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -1816,974 +1857,962 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride7_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $344, %rsp # imm = 0x158 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm8 -; SSE-NEXT: movdqa 16(%rcx), %xmm12 -; SSE-NEXT: movdqa 16(%r8), %xmm7 -; SSE-NEXT: movdqa 16(%r9), %xmm14 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE-NEXT: subq $312, %rsp # imm = 0x138 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: movdqa 16(%rcx), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa 16(%r9), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,1,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: pand %xmm15, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa 16(%rax), %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa 16(%rax), %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,6] +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm11, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,7,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] ; SSE-NEXT: movdqa %xmm5, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm4 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm15, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: pandn %xmm1, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,1,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm7, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm14 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,0,3] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rdi), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: movdqa (%rcx), %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,1,2,3] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: movdqa (%rdx), %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: movdqa (%r9), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,1,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,3] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa (%rcx), %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: movdqa (%r8), %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm4, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: movdqa (%rax), %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,4,4] -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm7, %xmm9 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,1,2,3] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,0] +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa (%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm2, %xmm14 +; SSE-NEXT: pandn %xmm6, %xmm14 +; SSE-NEXT: pand %xmm2, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: por %xmm4, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm13, %xmm7 -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm9 +; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,0,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm8, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm9 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[2,1,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,0] +; SSE-NEXT: movdqa %xmm5, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: pshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: pshuflw $255, (%rsp), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: # xmm6 = mem[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4] +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,6,6,7] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,6,5,7] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm14 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: pand %xmm9, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufd $101, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm15 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,1,3] -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm8, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,6,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm6, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm8 -; SSE-NEXT: por %xmm8, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,2,2] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,5,6,6] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm6, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,1] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[1,1,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm4 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 -; SSE-NEXT: pand %xmm12, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm14[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm4, %xmm6 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,0] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm12, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm14 -; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,0,0,0,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm12 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm4 +; SSE-NEXT: pandn %xmm12, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm9[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por %xmm12, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm8, %xmm12 +; SSE-NEXT: por %xmm12, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: por %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,6,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm10[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: pand %xmm9, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; SSE-NEXT: # xmm5 = mem[1,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm14 +; SSE-NEXT: pandn %xmm0, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[1,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE-NEXT: movdqa %xmm12, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: pandn %xmm3, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,4,6,5] +; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: por %xmm14, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; SSE-NEXT: pand %xmm14, %xmm1 -; SSE-NEXT: pandn %xmm6, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: pshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm13, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,2,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,0] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm14 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: pshufhw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: por %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,6,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm3, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: movdqa %xmm13, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: por %xmm3, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm8 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm14, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa (%rsp), %xmm14 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,2,2] +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: pandn %xmm3, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm11 +; SSE-NEXT: por %xmm11, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm15 +; SSE-NEXT: por %xmm3, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] ; SSE-NEXT: pandn %xmm1, %xmm13 ; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[2,2,2,2,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm13 ; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,2] -; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: # xmm1 = mem[2,2,3,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,1,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[0,1,2,3,5,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm1, %xmm12 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pshufhw $216, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[1,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm2, 32(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm8, 112(%rax) -; SSE-NEXT: movdqa %xmm14, 176(%rax) -; SSE-NEXT: movdqa %xmm15, (%rax) +; SSE-NEXT: movdqa %xmm1, 176(%rax) +; SSE-NEXT: movdqa %xmm0, 144(%rax) +; SSE-NEXT: movdqa %xmm9, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 16(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm0, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) +; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 160(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 208(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: addq $312, %rsp # imm = 0x138 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $168, %rsp +; AVX1-ONLY-NEXT: subq $152, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm9 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm7, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm1, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm10 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm10[4,u,u,u,u],zero,zero,xmm10[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm8 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,7],zero,xmm7[u,u,u,u,u,8],zero,xmm7[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm4 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u],zero,xmm4[7,u,u,u,u,u],zero,xmm4[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm4, %ymm9 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm9, %ymm13, %ymm9 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm13, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm13, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm6[u,u,u,u,5,6],zero,xmm6[u,u,u,u,12,13],zero,xmm6[u] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[6,u,u,u,u],zero,zero,xmm6[7,u,u,u,u],zero,zero,xmm6[8,u] +; AVX1-ONLY-NEXT: vpor %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,7],zero,xmm5[u,u,u,u,u,8],zero,xmm5[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm13, %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[8,9],zero,xmm6[u,u,u,u,10,11],zero,xmm6[u,u,u,u,12,13] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,zero,xmm12[9,u,u,u,u],zero,zero,xmm12[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[u,u,u,u,u],zero,xmm15[7,u,u,u,u,u],zero,xmm15[8,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm14, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u,u,u,u,5,6],zero,xmm1[u,u,u,u,12,13],zero,xmm1[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[6,u,u,u,u],zero,zero,xmm12[7,u,u,u,u],zero,zero,xmm12[8,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u],zero,xmm10[7,u,u,u,u,u],zero,xmm10[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u],zero,xmm11[7,u,u,u,u,u],zero,xmm11[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm5, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm11 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm13, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm13, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm11 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0],zero,xmm11[2,3,4,5,6,7],zero,xmm11[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm15 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,xmm2[13],zero,zero,zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,zero,zero,xmm2[15] -; AVX1-ONLY-NEXT: vpor %xmm13, %xmm11, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6,7],zero,xmm1[9,10,11,12,13,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm10 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm11[8],xmm8[9],xmm11[9],xmm8[10],xmm11[10],xmm8[11],xmm11[11],xmm8[12],xmm11[12],xmm8[13],xmm11[13],xmm8[14],xmm11[14],xmm8[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[13,u,u,u,u],zero,zero,xmm10[14,u,u,u,u],zero,zero,xmm10[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,4,5],zero,xmm6[u,u,u,u,6,7],zero,xmm6[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm8 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u],zero,zero,xmm6[2,u,u,u,u],zero,zero,xmm6[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,2,3],zero,xmm11[u,u,u,u,4,5],zero,xmm11[u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm14, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,6,7],zero,xmm11[u,u,u,u,8,9],zero,xmm11[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm11, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 208(%rax) -; AVX1-ONLY-NEXT: addq $168, %rsp +; AVX1-ONLY-NEXT: addq $152, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -2828,70 +2857,70 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm15 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm9, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u,u,9] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm7, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero,xmm15[u,u] +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] @@ -2964,11 +2993,11 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm9, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: popq %rax @@ -2982,119 +3011,115 @@ ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] -; AVX2-FAST-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX2-FAST-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -3137,12 +3162,12 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm12, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3153,69 +3178,69 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u,u],zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm14[4,u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] @@ -3240,51 +3265,51 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero @@ -3304,12 +3329,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3317,13 +3342,13 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm9 @@ -3360,25 +3385,23 @@ ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,0,1,0,4,4,5,4] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm15, %ymm0 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm6[u,u,u,u,14],zero,ymm6[u,u,u,u,u,15],zero,ymm6[u,u,u,u,u,16],zero,ymm6[u,u,u,u,u,17],zero,ymm6[u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm14, %ymm13 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[u,u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm8, %ymm0 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] @@ -3389,80 +3412,70 @@ ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; AVX512F-SLOW-NEXT: vpandn %ymm8, %ymm10, %ymm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm9, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm10, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm11, %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm12, %zmm10, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,ymm5[20,21,20,21],zero,ymm5[19],zero,ymm5[19,20,21,22],zero -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm5[23],zero,ymm5[23,24,25,26],zero,ymm5[24],zero,ymm5[30,31] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,2] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm12 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm2[18],zero,ymm2[20,21,20,21],zero,ymm2[19],zero,ymm2[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm10, %zmm0 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm5[30],zero,ymm5[28,u,u,u],zero,ymm5[31],zero,ymm5[29,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm9, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm11, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27,u,u,u],zero,ymm1[30],zero,ymm1[28,u,u,u],zero,ymm1[31],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq @@ -3471,148 +3484,141 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 (%r10), %ymm17 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512F-FAST-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm10, %zmm7 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512F-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm2 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,14],zero,ymm6[u,u,u,u,u,15],zero,ymm6[u,u,u,u,u,16],zero,ymm6[u,u,u,u,u,17],zero,ymm6[u,u,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm15 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero -; AVX512F-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm2[u,u,u,u,u,14],zero,ymm2[u,u,u,u,u,15],zero,ymm2[u,u,u,u,u,16],zero,ymm2[u,u,u,u,u,17],zero,ymm2[u,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandnq %ymm7, %ymm17, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[13,u,u,u,u],zero,zero,ymm1[14,u,u,u,u],zero,zero,ymm1[15,u,u,u,u],zero,zero,ymm1[16,u,u,u,u],zero,zero,ymm1[17,u,u] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm15, %ymm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u,u],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm15, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm13, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u] +; AVX512F-FAST-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm13, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm8[0,1,0,1,4,5,4,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm13[13,u,u,u,u],zero,zero,ymm13[14,u,u,u,u],zero,zero,ymm13[15,u,u,u,u],zero,zero,ymm13[16,u,u,u,u],zero,zero,ymm13[17,u,u] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpand %ymm0, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm6[18,19,20,21],zero,ymm6[19],zero,ymm6[25,26,27,22],zero,ymm6[20],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm2[18],zero,ymm2[20,21,20,21],zero,ymm2[19],zero,ymm2[19,20,21,22],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm10, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] -; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm5[30],zero,ymm5[28,u,u,u],zero,ymm5[31],zero,ymm5[29,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm2[28],zero,ymm2[30,31,30,31],zero,ymm2[29],zero,ymm2[31,28,29] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27,u,u,u],zero,ymm1[30],zero,ymm1[28,u,u,u],zero,ymm1[31],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm9, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -3620,154 +3626,157 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm9[4,u,u,u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6] +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[4],zero,xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero +; AVX512BW-SLOW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %r11 # imm = 0x4081020408102040 +; AVX512BW-SLOW-NEXT: kmovq %r11, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u],zero,xmm6[7],zero,xmm6[5,u,u,u],zero,xmm6[8],zero,xmm6[6,u,u] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero,xmm7[u,u] +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm11, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm11, %zmm4 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm4[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm14, %xmm4 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm14, %zmm4 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[0,1,0,1,14],zero,ymm0[14,15,0,1,14,15],zero,ymm0[13,14,15,16,17,16],zero,ymm0[30,31,30,31,16,17],zero,ymm0[31,28,29,30,31] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm11, %ymm5 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18] +; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm12, %ymm5 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-SLOW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm5 {%k1} ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm15, %ymm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm8 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm8[18,19,20,21],zero,zmm8[19],zero,zmm8[25,26,27,22],zero,zmm8[20],zero,zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm8[55],zero,zero,zero,zero,zmm8[58],zero,zmm8[56],zero,zero,zero,zero,zmm8[59],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm8, %zmm15, %zmm8 -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm9 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm9[18,19,20,21],zero,zmm9[19],zero,zmm9[25,26,27,22],zero,zmm9[20],zero,zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm9[55],zero,zero,zero,zero,zmm9[58],zero,zmm9[56],zero,zero,zero,zero,zmm9[59],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm10[18],zero,zero,zero,zero,zmm10[21],zero,zmm10[19],zero,zero,zero,zero,zmm10[22],zero,zmm10[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero,zmm10[57] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm10 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512BW-SLOW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm16, %zmm17, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm11 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm7[18],zero,zmm7[20,21,20,21],zero,zmm7[19],zero,zmm7[19,20,21,22],zero,zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm7[55],zero,zmm7[55,56,57,58],zero,zmm7[56],zero,zmm7[62,63] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm12 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[20],zero,zmm6[18],zero,zero,zero,zero,zmm6[21],zero,zmm6[19],zero,zero,zero,zero,zmm6[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm6[57],zero,zmm6[55],zero,zero,zero,zero,zmm6[58],zero,zmm6[56],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm11, %zmm12, %zmm11 ; AVX512BW-SLOW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm11 {%k2} ; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-SLOW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512BW-SLOW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm9, %zmm9 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 {%k2} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,2,3] ; AVX512BW-SLOW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512BW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512BW-SLOW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} ; AVX512BW-SLOW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -3791,9 +3800,9 @@ ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] +; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 @@ -3803,89 +3812,90 @@ ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm12, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm12 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm15 +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm7, %ymm16 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm7, %ymm15, %ymm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm16 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm17 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 ; AVX512BW-FAST-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm14 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm10 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm16 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm16, %zmm10 -; AVX512BW-FAST-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm17 = zmm17[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm16, %zmm17, %zmm16 -; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C -; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-FAST-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] -; AVX512BW-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] +; AVX512BW-FAST-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm10, %zmm10 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512BW-FAST-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-FAST-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero +; AVX512BW-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm11, %zmm12, %zmm11 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm10 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FAST-NEXT: vpermw %zmm10, %zmm11, %zmm10 ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm9 {%k1} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm9[19],zero,zmm9[21,20,21,22],zero,zmm9[20],zero,zmm9[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm9[55],zero,zmm9[53,54,55,58],zero,zmm9[56],zero,zmm9[60,61,58,59] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm10[21],zero,zmm10[19],zero,zero,zero,zero,zmm10[22],zero,zmm10[20],zero,zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm10[57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm10 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[18],zero,zero,zero,zero,zmm11[21],zero,zmm11[19],zero,zero,zero,zero,zmm11[22],zero,zmm11[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[55],zero,zero,zero,zero,zmm11[58],zero,zmm11[56],zero,zero,zero,zero,zmm11[59],zero,zmm11[57] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512BW-FAST-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm10 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm9, %zmm9 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm12 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm11, %zmm12, %zmm11 +; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm9, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C +; AVX512BW-FAST-NEXT: kmovq %rcx, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm11, %zmm10 {%k1} ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero @@ -3914,8 +3924,8 @@ ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -5338,542 +5348,515 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride7_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: subq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15] -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,xmm6[11,u,u,u,u],zero,zero,xmm6[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u],zero,xmm1[7,u,u,u,u,u],zero,xmm1[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[6,u,u,u,u],zero,zero,xmm4[7,u,u,u,u],zero,zero,xmm4[8,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm4[4,u,u,u,u],zero,zero,xmm4[5,u,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm10 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u],zero,xmm0[7,u,u,u,u,u],zero,xmm0[8,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,6,7],zero,xmm10[u,u,u,u,8,9],zero,xmm10[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpor %xmm11, %xmm10, %xmm10 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm0, %ymm7 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm12[4,u,u,u,u],zero,zero,xmm12[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm15 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm9 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u],zero,xmm11[7,u,u,u,u,u],zero,xmm11[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[u,u,u,u,5,6],zero,xmm3[u,u,u,u,12,13],zero,xmm3[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[6,u,u,u,u],zero,zero,xmm12[7,u,u,u,u],zero,zero,xmm12[8,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm15, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10],zero,xmm0[u,u,u,u,13,12],zero,xmm0[u,u,u,u,15,14],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm12[13,u,u,u,u],zero,zero,xmm12[14,u,u,u,u],zero,zero,xmm12[15] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,6,7],zero,xmm6[u,u,u,u,8,9],zero,xmm6[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u],zero,zero,xmm12[11,u,u,u,u],zero,zero,xmm12[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm12 -; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,3],zero,xmm0[u,u,u,u,4,5],zero,xmm0[u,u,u] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u> -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u> -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm7 -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u],zero,xmm3[7,u,u,u,u,u],zero,xmm3[8,u,u] +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm13 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[6,u,u,u,u],zero,zero,xmm13[7,u,u,u,u],zero,zero,xmm13[8,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128> +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,6,7],zero,xmm3[u,u,u,u,8,9],zero,xmm3[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,2,3],zero,xmm3[u,u,u,u,4,5],zero,xmm3[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6] +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u],zero,xmm7[7,u,u,u,u,u],zero,xmm7[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[u,7],zero,xmm9[u,u,u,u,u,8],zero,xmm9[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm11 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u],zero,zero,xmm13[11,u,u,u,u],zero,zero,xmm13[12,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,zero,xmm13[9,u,u,u,u],zero,zero,xmm13[10,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u],zero,zero,xmm11[2,u,u,u,u],zero,zero,xmm11[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm6 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,u,u,u],zero,xmm6[7,u,u,u,u,u],zero,xmm6[8,u,u] -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,7],zero,xmm8[u,u,u,u,u,8],zero,xmm8[u,u] +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm8 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u] -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm7 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm15, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[u,u,u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm14 -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u],zero,zero,xmm7[11,u,u,u,u],zero,zero,xmm7[12,u,u,u,u],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm14[8],xmm1[9],xmm14[9],xmm1[10],xmm14[10],xmm1[11],xmm14[11],xmm1[12],xmm14[12],xmm1[13],xmm14[13],xmm1[14],xmm14[14],xmm1[15],xmm14[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u> -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u> -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm14 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm9, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm2[4,u,u,u,u],zero,zero,xmm2[5,u,u,u,u],zero,zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u],zero,zero,xmm2[2,u,u,u,u],zero,zero,xmm2[3,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,4,5],zero,xmm3[u,u,u,u,6,7],zero,xmm3[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,7],zero,xmm14[u,u,u,u,u,8],zero,xmm14[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u],zero,xmm3[7,u,u,u,u,u],zero,xmm3[8,u,u,u,u,u],zero +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,7],zero,xmm1[u,u,u,u,u,8],zero,xmm1[u,u,u,u,u,9] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2],zero,xmm2[u,u,6,7,8,9],zero,xmm2[u,u,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[9,u,u],zero,zero,zero,zero,xmm4[10,u,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3],zero,xmm2[u,6,7,8,9,10],zero,xmm2[u,13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,xmm6[9,u],zero,zero,zero,zero,zero,xmm6[10,u],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4],zero,xmm2[6,7,8,9,10,11],zero,xmm2[13,14,15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,zero,zero,xmm8[9],zero,zero,zero,zero,zero,zero,xmm8[10],zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u],zero,xmm12[7,u,u,u,u,u],zero,xmm12[8,u,u,u,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u] -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,xmm2[7,u,u,u,u,u],zero,xmm2[8,u,u,u,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,7],zero,xmm4[u,u,u,u,u,8],zero,xmm4[u,u,u,u,u,9] -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm7 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[u,u,6,7,8,9],zero,xmm1[u,u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm4[9,u,u],zero,zero,zero,zero,xmm4[10,u,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,xmm1[u,6,7,8,9,10],zero,xmm1[u,13,14,15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero,zero,xmm2[10,u],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10,11],zero,xmm1[13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm11[9],zero,zero,zero,zero,zero,zero,xmm11[10],zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4],zero,xmm0[u,u,8,9,10,11],zero,xmm0[u,u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u],zero,zero,zero,zero,xmm4[7,u,u],zero,zero,zero,zero,xmm4[8,u,u],zero -; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm5 -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u],zero,zero,zero,zero,xmm4[7,u,u],zero,zero,zero,zero,xmm4[8,u,u],zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,1,2,3,4,5],zero,xmm0[u,8,9,10,11,12],zero,xmm0[u,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u],zero,zero,zero,zero,zero,xmm2[7,u],zero,zero,zero,zero,zero,xmm2[8,u],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u],zero,zero,zero,zero,zero,xmm6[7,u],zero,zero,zero,zero,zero,xmm6[8,u],zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6],zero,xmm0[8,9,10,11,12,13],zero,xmm0[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[6],zero,zero,zero,zero,zero,zero,xmm11[7],zero,zero,zero,zero,zero,zero,xmm11[8],zero -; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[6],zero,zero,zero,zero,zero,zero,xmm8[7],zero,zero,zero,zero,zero,zero,xmm8[8],zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 -; AVX1-ONLY-NEXT: vmovaps %ymm14, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm5[11,u,u],zero,zero,zero,zero,xmm5[12,u,u],zero,zero,zero,zero,xmm5[13] -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[11,u,u],zero,zero,zero,zero,xmm4[12,u,u],zero,zero,zero,zero,xmm4[13] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[u,4,5,6,7,8],zero,xmm1[u,11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[11,u],zero,zero,zero,zero,zero,xmm2[12,u],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm6[11,u],zero,zero,zero,zero,zero,xmm6[12,u],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8,9],zero,xmm1[11,12,13,14,15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[11],zero,zero,zero,zero,zero,zero,xmm11[12],zero,zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm8[11],zero,zero,zero,zero,zero,zero,xmm8[12],zero,zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[13,u],zero,zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,zero,xmm2[15,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[13,u],zero,zero,zero,zero,zero,xmm6[14,u],zero,zero,zero,zero,zero,xmm6[15,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm11[13],zero,zero,zero,zero,zero,zero,xmm11[14],zero,zero,zero,zero,zero,zero,xmm11[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm8[13],zero,zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,zero,xmm8[15] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm13[13,u,u,u,u],zero,zero,xmm13[14,u,u,u,u],zero,zero,xmm13[15] -; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm7 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[4,5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm10, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9> -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm9, %ymm9 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm13, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm13 -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm11, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpor %xmm5, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm11 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] -; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm0 -; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm11, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm5 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5> +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm1 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,13,u,u,u,u,128,128,14,u,u,u,u,128,128,15> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <10,128,u,u,u,u,13,12,128,u,u,u,u,15,14,128> +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm12 = xmm15[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm12, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9] +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm15 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u] +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm3, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u],zero,zero,xmm13[2,u,u,u,u],zero,zero,xmm13[3,u,u,u,u] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,4,5],zero,xmm15[u,u,u,u,6,7],zero,xmm15[u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm14 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[4,5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm14, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm10, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm2, 352(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 320(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm1, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 96(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 96(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm9, 224(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm1, 352(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 432(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5882,49 +5865,49 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, 384(%rax) ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vmovaps %xmm0, 400(%rax) -; AVX1-ONLY-NEXT: addq $632, %rsp # imm = 0x278 +; AVX1-ONLY-NEXT: addq $552, %rsp # imm = 0x228 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[7],zero,ymm0[11,4,5,10],zero,ymm0[8],zero,ymm0[10,11,10,11],zero,ymm0[9],zero,ymm0[23],zero,ymm0[27,20,21,26],zero,ymm0[24],zero,ymm0[26,27,26,27],zero,ymm0[25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,ymm1[11],zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero,ymm1[27],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm5[9],zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -5940,7 +5923,7 @@ ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -5973,369 +5956,357 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm7 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[1,1,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm9 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[1,1,0,0,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm15, %ymm13, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[1,1,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm11, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm15 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm12, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm12, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm11 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,7],zero,xmm0[5],zero,xmm0[u,u,u,8],zero,xmm0[6],zero,xmm0[u,u,u,9] +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm5, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm5[4,u,u,u],zero,xmm5[7],zero,xmm5[5,u,u,u],zero,xmm5[8],zero,xmm5[6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm15 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm12 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm6 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm15[8],xmm14[9],xmm15[9],xmm14[10],xmm15[10],xmm14[11],xmm15[11],xmm14[12],xmm15[12],xmm14[13],xmm15[13],xmm14[14],xmm15[14],xmm14[15],xmm15[15] -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,5,5,6] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,5,5,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw $150, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm15[2],zero,zero,zero,zero,ymm15[5],zero,ymm15[3],zero,zero,zero,zero,ymm15[6],zero,ymm15[4],zero,ymm15[18],zero,zero,zero,zero,ymm15[21],zero,ymm15[19],zero,zero,zero,zero,ymm15[22],zero,ymm15[20],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[2],zero,zero,zero,zero,ymm11[5],zero,ymm11[3],zero,zero,zero,zero,ymm11[6],zero,ymm11[4],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,ymm6[27,20,21,26],zero,ymm6[24],zero,ymm6[26,27,26,27],zero,ymm6[25] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[9],zero,ymm8[7],zero,zero,zero,zero,ymm8[10],zero,ymm8[8],zero,zero,zero,zero,ymm8[11],zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm15[9],zero,ymm15[7],zero,zero,zero,zero,ymm15[10],zero,ymm15[8],zero,zero,zero,zero,ymm15[11],zero,ymm15[25],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero,ymm15[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm11 +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,ymm15[0],zero,ymm15[14],zero,zero,zero,zero,ymm15[1],zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,ymm15[30],zero,zero,zero,zero,ymm15[17],zero,ymm15[31],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 -; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm11[0],zero,ymm11[14],zero,zero,zero,zero,ymm11[1],zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,ymm11[30],zero,zero,zero,zero,ymm11[17],zero,ymm11[31],zero,zero,zero,zero +; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-SLOW-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm9, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm8, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm9, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm8, %ymm5 +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm6, %ymm2, %ymm6 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm6, 320(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm5, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) -; AVX2-SLOW-NEXT: addq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, 416(%rax) +; AVX2-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: subq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 @@ -6344,19 +6315,19 @@ ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -6373,19 +6344,19 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[1,2,3,14],zero,ymm6[12],zero,ymm6[12,13,14,15],zero,ymm6[13],zero,ymm6[15],zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[11],zero,zero,zero,zero,ymm7[14],zero,ymm7[12],zero,zero,zero,zero,ymm7[15],zero,ymm7[13],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm9[14],zero,ymm9[12],zero,zero,zero,zero,ymm9[15],zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> @@ -6398,359 +6369,348 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,0,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[1,1,0,0,4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm7 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm15, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm12, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm14, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm11, %ymm15, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm15, %xmm12 ; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm12, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm10, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm6 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[17,18,19,30],zero,ymm1[28],zero,ymm1[28,29,30,31],zero,ymm1[29],zero,ymm1[31] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[7],zero,ymm6[11,4,5,10],zero,ymm6[8],zero,ymm6[10,11,10,11],zero,ymm6[9],zero,ymm6[23],zero,ymm6[27,20,21,26],zero,ymm6[24],zero,ymm6[26,27,26,27],zero,ymm6[25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm0[9],zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[23],zero,ymm11[27,20,21,26],zero,ymm11[24],zero,ymm11[26,27,26,27],zero,ymm11[25] -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm15 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vpor %ymm3, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm8, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm0[1,2,3,14],zero,ymm0[12],zero,ymm0[12,13,14,15],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[11],zero,zero,zero,zero,ymm1[14],zero,ymm1[12],zero,zero,zero,zero,ymm1[15],zero,ymm1[13],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm13 = ymm8[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,4,5,5,7,4,5] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm11, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm14, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm10 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm12 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-NEXT: vpor %ymm9, %ymm13, %ymm9 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm13 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm9, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpor %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-NEXT: vmovdqa %ymm1, 320(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 320(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm10, 352(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6758,22 +6718,16 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-NEXT: addq $600, %rsp # imm = 0x258 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: subq $600, %rsp # imm = 0x258 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm2 @@ -6782,19 +6736,19 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[7],zero,ymm1[11,4,5,10],zero,ymm1[8],zero,ymm1[10,11,10,11],zero,ymm1[9],zero,ymm1[23],zero,ymm1[27,20,21,26],zero,ymm1[24],zero,ymm1[26,27,26,27],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm7[9],zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm6[9],zero,ymm6[7],zero,zero,zero,zero,ymm6[10],zero,ymm6[8],zero,zero,zero,zero,ymm6[11],zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -6811,19 +6765,19 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[1,2,3,14],zero,ymm6[12],zero,ymm6[12,13,14,15],zero,ymm6[13],zero,ymm6[15],zero,ymm6[17,18,19,30],zero,ymm6[28],zero,ymm6[28,29,30,31],zero,ymm6[29],zero,ymm6[31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[11],zero,zero,zero,zero,ymm7[14],zero,ymm7[12],zero,zero,zero,zero,ymm7[15],zero,ymm7[13],zero,ymm7[27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm9[14],zero,ymm9[12],zero,zero,zero,zero,ymm9[15],zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> @@ -6836,350 +6790,348 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm14, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm14, %xmm13, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm13, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm15, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm13, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm13, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm11, %xmm14, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm14, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm11, %ymm15, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm15, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm10, %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm10, %ymm12, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm6 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm15[8],mem[8],xmm15[9],mem[9],xmm15[10],mem[10],xmm15[11],mem[11],xmm15[12],mem[12],xmm15[13],mem[13],xmm15[14],mem[14],xmm15[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[17,18,19,30],zero,ymm12[28],zero,ymm12[28,29,30,31],zero,ymm12[29],zero,ymm12[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[7],zero,ymm6[11,4,5,10],zero,ymm6[8],zero,ymm6[10,11,10,11],zero,ymm6[9],zero,ymm6[23],zero,ymm6[27,20,21,26],zero,ymm6[24],zero,ymm6[26,27,26,27],zero,ymm6[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,ymm2[11],zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm0[9],zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[23],zero,ymm15[27,20,21,26],zero,ymm15[24],zero,ymm15[26,27,26,27],zero,ymm15[25] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm0[1,2,3,14],zero,ymm0[12],zero,ymm0[12,13,14,15],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17,18,19,30],zero,ymm0[28],zero,ymm0[28,29,30,31],zero,ymm0[29],zero,ymm0[31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm5, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[11],zero,zero,zero,zero,ymm1[14],zero,ymm1[12],zero,zero,zero,zero,ymm1[15],zero,ymm1[13],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm15, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [2,128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,5,128,3,128,128,128,128,6,128,4,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm13, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm6, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [128,4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [4,128,2,128,128,128,128,5,128,3,128,128,128,128,6,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm12, %ymm15, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm13, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm12, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm7, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm9, %ymm11, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm15, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm8, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 320(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 352(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7187,1974 +7139,1470 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $648, %rsp # imm = 0x288 +; AVX2-FAST-PERLANE-NEXT: addq $600, %rsp # imm = 0x258 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $1368, %rsp # imm = 0x558 -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: subq $1464, %rsp # imm = 0x5B8 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero,zero,ymm11[18] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31] ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm7 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm8 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[13,u,u,u,u,u],zero,ymm8[14,u,u,u,u,u],zero,ymm8[15,u,u,u,u,u],zero,ymm8[16,u,u,u,u,u],zero,ymm8[17,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm24 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] -; AVX512F-SLOW-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm31 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[12,13,14],zero,ymm10[12],zero,ymm10[14,15,14,15],zero,ymm10[13],zero,ymm10[15,12,13,28,29,30],zero,ymm10[28],zero,ymm10[30,31,30,31],zero,ymm10[29],zero,ymm10[31,28,29] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <9,128,7,u,u,u,128,10,128,8,u,u,u,128,11,128,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128> +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm7[23],zero,ymm7[21,22,23,26],zero,ymm7[24],zero,ymm7[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm7, %ymm26 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[11,u,u,u],zero,ymm10[14],zero,ymm10[12,u,u,u],zero,ymm10[15],zero,ymm10[13,u,27,u,u,u],zero,ymm10[30],zero,ymm10[28,u,u,u],zero,ymm10[31],zero,ymm10[29,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,128,7,128,5,6,7,10,128,8,128,12,13,10,11,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm10[11],zero,zero,zero,zero,ymm10[14],zero,ymm10[12],zero,zero,zero,zero,ymm10[15],zero,ymm10[13],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29] +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13],zero,ymm2[11,u,u,u],zero,ymm2[14],zero,ymm2[12,u,u,u],zero,ymm2[15],zero,ymm2[29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,8,9,128,7,128,7,8,9,10,128,8,128,14,15,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm10, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u,u,9] +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm27 -; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 +; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm14, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm11[5],zero,ymm11[3],zero,zero,zero,zero,ymm11[6],zero,ymm11[4],zero,zero,zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[2,3,4,5],zero,ymm14[3],zero,ymm14[5,4,5,6],zero,ymm14[4],zero,ymm14[6,7,18,19,20,21],zero,ymm14[19],zero,ymm14[21,20,21,22],zero,ymm14[20],zero,ymm14[22,23] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm3[2],zero,zero,zero,zero,ymm3[5],zero,ymm3[3],zero,zero,zero,zero,ymm3[6],zero,ymm3[4],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[2],zero,ymm4[2,3,4,5],zero,ymm4[3],zero,ymm4[9,10,11,6],zero,ymm4[4],zero,ymm4[18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm8[4],zero,ymm8[2],zero,zero,zero,zero,ymm8[5],zero,ymm8[3],zero,zero,zero,zero,ymm8[6],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm7, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[4],zero,ymm7[2],zero,ymm7[4,5,4,5],zero,ymm7[3],zero,ymm7[3,4,5,6],zero,ymm7[20],zero,ymm7[18],zero,ymm7[20,21,20,21],zero,ymm7[19],zero,ymm7[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vporq %xmm2, %xmm3, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512F-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm8, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm3 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa %ymm11, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm13 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm25 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm29 -; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512F-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,0,1],zmm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero,xmm11[u,u,u,9] +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm30 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vporq %xmm0, %xmm1, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,0,1,14],zero,ymm2[14,15,0,1,14,15],zero,ymm2[13,14,15,16,17,16],zero,ymm2[30,31,30,31,16,17],zero,ymm2[31,28,29,30,31] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm21 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandn %ymm1, %ymm9, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm10[0,1,2,3],zmm12[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] -; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm3 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm13, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm24 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm12, %ymm31 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm22, %zmm22 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandn %ymm1, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm17 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm26 # 16-byte Folded Reload +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,ymm0[14],zero,ymm0[12],zero,zero,zero,zero,ymm0[15],zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12,13,14],zero,ymm0[12],zero,ymm0[14,15,14,15],zero,ymm0[13],zero,ymm0[15,12,13,28,29,30],zero,ymm0[28],zero,ymm0[30,31,30,31],zero,ymm0[29],zero,ymm0[31,28,29] +; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3],xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm15, %ymm9 -; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm15 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm0[14],zero,ymm0[12],zero,zero,zero,zero,ymm0[15],zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-SLOW-NEXT: vpandnq %ymm15, %ymm25, %ymm15 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm30[2,3,2,3] -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[11,u,u,u],zero,ymm0[14],zero,ymm0[12,u,u,u],zero,ymm0[15],zero,ymm0[13,u,27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero,ymm0[29,u] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm2[11],zero,zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,ymm2[11,u,u,u],zero,ymm2[14],zero,ymm2[12,u,u,u],zero,ymm2[15],zero,ymm2[29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm8[2,3,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm10[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm9[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm3[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm13[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm20[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm19[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vpand %ymm4, %ymm11, %ymm8 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm9, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm23[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm11 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm9, %zmm0 -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm17 = zmm24[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm17 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm22[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm8, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogq $248, %ymm4, %ymm16, %ymm15 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, %ymm4, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm18, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm4 -; AVX512F-SLOW-NEXT: vpor %ymm13, %ymm12, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm1, %zmm5, %zmm0 -; AVX512F-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm20 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm31[0,1,0,1] -; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-SLOW-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512F-SLOW-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,0] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm15 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm29[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm28[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm27, %zmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm11 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm30, %zmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm28, %zmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm1[2],zero,zero,zero,zero,ymm1[5],zero,ymm1[3],zero,zero,zero,zero,ymm1[6],zero,ymm1[4],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[2],zero,ymm1[2,3,4,5],zero,ymm1[3],zero,ymm1[9,10,11,6],zero,ymm1[4],zero,ymm1[18],zero,ymm1[18,19,20,21],zero,ymm1[19],zero,ymm1[25,26,27,22],zero,ymm1[20],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm1[5],zero,ymm1[3],zero,zero,zero,zero,ymm1[6],zero,ymm1[4],zero,zero,zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[2,3,4,5],zero,ymm1[3],zero,ymm1[5,4,5,6],zero,ymm1[4],zero,ymm1[6,7,18,19,20,21],zero,ymm1[19],zero,ymm1[21,20,21,22],zero,ymm1[20],zero,ymm1[22,23] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm21 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm21, %zmm28, %zmm21 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm27 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm28, %zmm27, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm27 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm21 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm21, %zmm28, %zmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm28, %zmm21 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm27 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm30 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm27, %zmm30, %zmm27 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm25 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm24 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm25, %zmm24, %zmm24 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm27, %zmm28, %zmm24 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm28, %zmm11 +; AVX512F-SLOW-NEXT: vporq %ymm6, %ymm31, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm1 +; AVX512F-SLOW-NEXT: vpor %ymm12, %ymm15, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpor %ymm10, %ymm8, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[4],zero,ymm0[2],zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[20],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[4],zero,ymm2[2],zero,ymm2[4,5,4,5],zero,ymm2[3],zero,ymm2[3,4,5,6],zero,ymm2[20],zero,ymm2[18],zero,ymm2[20,21,20,21],zero,ymm2[19],zero,ymm2[19,20,21,22],zero +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm16[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm20[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm19[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm18[0,1,0,1] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm10[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = zero,ymm10[13],zero,zero,zero,zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-SLOW-NEXT: vpandnq %ymm10, %ymm18, %ymm10 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm10 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[1,1,0,0,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,3,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm24 = mem[2,3,2,3] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm21 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm21 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,0,1,0] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,6] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm17 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm17 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm4 -; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm16, %ymm5 -; AVX512F-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm4 +; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm7 = mem[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm29[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm26[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm16 = zmm23[0,0,1,0,4,4,5,4] +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm16 +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm7 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: # zmm8 = mem[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm1 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,0,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vporq %ymm22, %ymm24, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm7[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm25[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm20[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm12[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm24, %zmm3 +; AVX512F-SLOW-NEXT: vporq %ymm25, %ymm27, %ymm7 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm15[0,0,1,0] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm8 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,0,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm22[0,1,0,1,4,5,4,5] +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm7 +; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm10 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-SLOW-NEXT: addq $1368, %rsp # imm = 0x558 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-SLOW-NEXT: addq $1464, %rsp # imm = 0x5B8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: -; AVX512F-ONLY-FAST: # %bb.0: -; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 -; AVX512F-ONLY-FAST-NEXT: vzeroupper -; AVX512F-ONLY-FAST-NEXT: retq -; -; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512DQ-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-FAST-LABEL: store_i8_stride7_vf64: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: subq $1736, %rsp # imm = 0x6C8 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero,ymm13[18] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,0,1,14],zero,ymm5[14,15,0,1,14,15],zero,ymm5[13,14,15,16,17,16],zero,ymm5[30,31,30,31,16,17],zero,ymm5[31,28,29,30,31] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm10 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm2[14],zero,ymm2[12],zero,zero,zero,zero,ymm2[15],zero,ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[12,13,14],zero,ymm6[12],zero,ymm6[14,15,14,15],zero,ymm6[13],zero,ymm6[15,12,13,28,29,30],zero,ymm6[28],zero,ymm6[30,31,30,31],zero,ymm6[29],zero,ymm6[31,28,29] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <9,128,7,u,u,u,128,10,128,8,u,u,u,128,11,128,25,128,23,u,u,u,128,26,128,24,u,u,u,128,27,128> +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm11[14],zero,ymm11[12],zero,zero,zero,zero,ymm11[15],zero,ymm11[13],zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[11,u,u,u],zero,ymm12[14],zero,ymm12[12,u,u,u],zero,ymm12[15],zero,ymm12[13,u,27,u,u,u],zero,ymm12[30],zero,ymm12[28,u,u,u],zero,ymm12[31],zero,ymm12[29,u] +; AVX512F-FAST-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,128,7,128,5,6,7,10,128,8,128,12,13,10,11,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm6[11],zero,zero,zero,zero,ymm6[14],zero,ymm6[12],zero,zero,zero,zero,ymm6[15],zero,ymm6[13],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,9,128,7,128,128,128,128,10,128,8,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13],zero,ymm2[11,u,u,u],zero,ymm2[14],zero,ymm2[12,u,u,u],zero,ymm2[15],zero,ymm2[29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,128,7,128,7,8,9,10,128,8,128,14,15,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero,xmm7[u,u,u,9] +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm15 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm3[4,u,u,u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm29 +; AVX512F-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm14 +; AVX512F-FAST-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512F-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm13[5],zero,ymm13[3],zero,zero,zero,zero,ymm13[6],zero,ymm13[4],zero,zero,zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[2,3,4,5],zero,ymm14[3],zero,ymm14[5,4,5,6],zero,ymm14[4],zero,ymm14[6,7,18,19,20,21],zero,ymm14[19],zero,ymm14[21,20,21,22],zero,ymm14[20],zero,ymm14[22,23] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm4[2],zero,zero,zero,zero,ymm4[5],zero,ymm4[3],zero,zero,zero,zero,ymm4[6],zero,ymm4[4],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[2],zero,ymm5[2,3,4,5],zero,ymm5[3],zero,ymm5[9,10,11,6],zero,ymm5[4],zero,ymm5[18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm10[4],zero,ymm10[2],zero,zero,zero,zero,ymm10[5],zero,ymm10[3],zero,zero,zero,zero,ymm10[6],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[4],zero,ymm9[2],zero,ymm9[4,5,4,5],zero,ymm9[3],zero,ymm9[3,4,5,6],zero,ymm9[20],zero,ymm9[18],zero,ymm9[20,21,20,21],zero,ymm9[19],zero,ymm9[19,20,21,22],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm22 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u],zero,xmm5[7],zero,xmm5[5,u,u,u],zero,xmm5[8],zero,xmm5[6,u,u,u],zero +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512F-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm11[0,1,0,1,14],zero,ymm11[14,15,0,1,14,15],zero,ymm11[13,14,15,16,17,16],zero,ymm11[30,31,30,31,16,17],zero,ymm11[31,28,29,30,31] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm31 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm28 +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm26 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512F-FAST-NEXT: vpor %ymm0, %ymm10, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm4 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm4[13],zero,zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm23 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3],xmm15[4],xmm3[4],xmm15[5],xmm3[5],xmm15[6],xmm3[6],xmm15[7],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 16-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,ymm0[14],zero,ymm0[12],zero,zero,zero,zero,ymm0[15],zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[12,13,14],zero,ymm0[12],zero,ymm0[14,15,14,15],zero,ymm0[13],zero,ymm0[15,12,13,28,29,30],zero,ymm0[28],zero,ymm0[30,31,30,31],zero,ymm0[29],zero,ymm0[31,28,29] +; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm0[14],zero,ymm0[12],zero,zero,zero,zero,ymm0[15],zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[11,u,u,u],zero,ymm0[14],zero,ymm0[12,u,u,u],zero,ymm0[15],zero,ymm0[13,u,27,u,u,u],zero,ymm0[30],zero,ymm0[28,u,u,u],zero,ymm0[31],zero,ymm0[29,u] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm8[11],zero,zero,zero,zero,ymm8[14],zero,ymm8[12],zero,zero,zero,zero,ymm8[15],zero,ymm8[13],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29] +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,ymm8[11,u,u,u],zero,ymm8[14],zero,ymm8[12,u,u,u],zero,ymm8[15],zero,ymm8[29],zero,ymm8[27,u,u,u],zero,ymm8[30],zero,ymm8[28,u,u,u],zero,ymm8[31],zero +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm11[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm12[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm7[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm9[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm15[0,1,0,1] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm19 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm11 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm18 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm15 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = zero,ymm0[2],zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[4],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[2],zero,ymm0[2,3,4,5],zero,ymm0[3],zero,ymm0[9,10,11,6],zero,ymm0[4],zero,ymm0[18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[2,3,4,5],zero,ymm0[3],zero,ymm0[5,4,5,6],zero,ymm0[4],zero,ymm0[6,7,18,19,20,21],zero,ymm0[19],zero,ymm0[21,20,21,22],zero,ymm0[20],zero,ymm0[22,23] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm0[4],zero,ymm0[2],zero,zero,zero,zero,ymm0[5],zero,ymm0[3],zero,zero,zero,zero,ymm0[6],zero,ymm0[20],zero,ymm0[18],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[4],zero,ymm0[2],zero,ymm0[4,5,4,5],zero,ymm0[3],zero,ymm0[3,4,5,6],zero,ymm0[20],zero,ymm0[18],zero,ymm0[20,21,20,21],zero,ymm0[19],zero,ymm0[19,20,21,22],zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[6,7,10,11,8,9,6,7,8,9,10,11,10,11,8,9,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[12,13,10,11,12,13,14,15,14,15,12,13,12,13,14,15,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,2,3,4,5,2,3,4,5,12,13,14,15,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm0[13],zero,zero,zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm31 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm31, %zmm0 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm31 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm29 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm31, %zmm29, %zmm29 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm31 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm31, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm29, %zmm31, %zmm0 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm29 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm28 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm29, %zmm28, %zmm28 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm29 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm26 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm29, %zmm26, %zmm26 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm28, %zmm31, %zmm26 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm19 = zmm19[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm18 = zmm18[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm19, %zmm31, %zmm18 +; AVX512F-FAST-NEXT: vporq %ymm25, %ymm21, %ymm19 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm17 +; AVX512F-FAST-NEXT: vporq %ymm22, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm10, %zmm10 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm10 +; AVX512F-FAST-NEXT: vpor %ymm14, %ymm12, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm12 +; AVX512F-FAST-NEXT: vpor %ymm13, %ymm15, %ymm13 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm13 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm12, %zmm16, %zmm13 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm12, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm30[0,1,0,1] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm11[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm11 = mem[0,0,1,0] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm14 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm27[0,0,1,0] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,2,3] +; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm23 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm23 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpermq $16, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[0,0,1,0,4,4,5,4] +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm12 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm12 +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm0 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm9 = mem[2,3,2,3,6,7,6,7] +; AVX512F-FAST-NEXT: vporq %zmm0, %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vpor %ymm14, %ymm15, %ymm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,0,1,4,5,4,5] +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm5 +; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm1 +; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, 64(%rax) +; AVX512F-FAST-NEXT: addq $1736, %rsp # imm = 0x6C8 +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: subq $72, %rsp ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %ymm13 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpermw %ymm13, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm5 -; AVX512BW-SLOW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm9, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm4 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm0, %ymm5 +; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %xmm19 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm19[8],xmm14[8],xmm19[9],xmm14[9],xmm19[10],xmm14[10],xmm19[11],xmm14[11],xmm19[12],xmm14[12],xmm19[13],xmm14[13],xmm19[14],xmm14[14],xmm19[15],xmm14[15] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm21 ; AVX512BW-SLOW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm11 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm17, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm18, %ymm7 -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm29 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm23 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm23[8],xmm29[8],xmm23[9],xmm29[9],xmm23[10],xmm29[10],xmm23[11],xmm29[11],xmm23[12],xmm29[12],xmm23[13],xmm29[13],xmm23[14],xmm29[14],xmm23[15],xmm29[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm21 {%k1} +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm4, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm5, %ymm7 +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm22 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm23 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm7, %xmm7 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm21, %ymm7 -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm16[8],xmm22[8],xmm16[9],xmm22[9],xmm16[10],xmm22[10],xmm16[11],xmm22[11],xmm16[12],xmm22[12],xmm16[13],xmm22[13],xmm16[14],xmm22[14],xmm16[15],xmm22[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm25 = xmm25[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm8 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm25 +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm7, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm8, %ymm24 +; AVX512BW-SLOW-NEXT: vporq %ymm1, %ymm24, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm26 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm27 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512BW-SLOW-NEXT: vpshufb %xmm24, %xmm28, %xmm28 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm28, %zmm3 ; AVX512BW-SLOW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm19, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm3 {%k1} ; AVX512BW-SLOW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm11, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm11 -; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm28 -; AVX512BW-SLOW-NEXT: vpshufb %ymm2, %ymm28, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm31 -; AVX512BW-SLOW-NEXT: vpshufb %ymm4, %ymm31, %ymm4 -; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[20],zero,ymm28[18],zero,zero,zero,zero,ymm28[21],zero,ymm28[19],zero,zero,zero,zero,ymm28[22] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm4 -; AVX512BW-SLOW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm21, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0] +; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm21, %ymm21 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm28 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm6, %xmm6 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm21, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm29 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm14, %xmm21 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm19, %xmm25 +; AVX512BW-SLOW-NEXT: vporq %xmm21, %xmm25, %xmm21 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0],xmm14[0],xmm19[1],xmm14[1],xmm19[2],xmm14[2],xmm19[3],xmm14[3],xmm19[4],xmm14[4],xmm19[5],xmm14[5],xmm19[6],xmm14[6],xmm19[7],xmm14[7] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm19, %xmm19 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm19, %zmm19 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm21 = zmm19[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %r10 # imm = 0x4081020408102040 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm6, %zmm21 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm31 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm31, %xmm22, %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm20, %xmm19 +; AVX512BW-SLOW-NEXT: vporq %xmm6, %xmm19, %xmm6 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm20[0],xmm22[0],xmm20[1],xmm22[1],xmm20[2],xmm22[2],xmm20[3],xmm22[3],xmm20[4],xmm22[4],xmm20[5],xmm22[5],xmm20[6],xmm22[6],xmm20[7],xmm22[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm20, %xmm20 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm20, %zmm6 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm27, %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm26, %xmm22 +; AVX512BW-SLOW-NEXT: vporq %xmm20, %xmm22, %xmm20 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm25 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512BW-SLOW-NEXT: vpshufb %xmm25, %xmm22, %xmm22 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm22, %zmm20 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm22 = zmm6[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm20[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %r10 # imm = 0xC183060C183060C +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm20 +; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %r10 # imm = 0x70E1C3870E1C3870 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm21, %zmm6 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm21 +; AVX512BW-SLOW-NEXT: vpermw %ymm20, %ymm9, %ymm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm28, %xmm20, %xmm22 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[0,0,1,0] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm22, %zmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm22 +; AVX512BW-SLOW-NEXT: vpshufb %xmm29, %xmm21, %xmm26 +; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm22, %xmm27 +; AVX512BW-SLOW-NEXT: vporq %xmm26, %xmm27, %xmm27 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm26, %xmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm27, %zmm10 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $290499906672591364, %r10 # imm = 0x408102040810204 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm10 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm28 +; AVX512BW-SLOW-NEXT: vpshufb %xmm31, %xmm27, %xmm9 +; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm2 +; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm23, %xmm9, %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm30, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm28, %xmm0 +; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-SLOW-NEXT: vpshufb %xmm24, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-SLOW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 ; AVX512BW-SLOW-NEXT: kmovq %r10, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm25[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: movl $676341840, %esi # imm = 0x28502850 -; AVX512BW-SLOW-NEXT: kmovd %esi, %k1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1} +; AVX512BW-SLOW-NEXT: movabsq $-8714997200177740921, %r10 # imm = 0x870E1C3870E1C387 +; AVX512BW-SLOW-NEXT: kmovq %r10, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} +; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-SLOW-NEXT: vpermw %ymm10, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm10, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %ymm11 +; AVX512BW-SLOW-NEXT: vpshufb %ymm15, %ymm11, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm15, %ymm2 +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm15[18],zero,ymm15[20,21,20,21],zero,ymm15[19],zero,ymm15[19,20,21,22],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm16 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] +; AVX512BW-SLOW-NEXT: vporq %ymm2, %ymm16, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm24 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm23 +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm23[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm31 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512BW-SLOW-NEXT: movl $676341840, %eax # imm = 0x28502850 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-SLOW-NEXT: vpshufb %ymm31, %ymm24, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm23, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm24, %ymm16 +; AVX512BW-SLOW-NEXT: vporq %ymm2, %ymm16, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm17 +; AVX512BW-SLOW-NEXT: vpshufb %ymm12, %ymm17, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm12, %ymm2 +; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm17[2],zero,ymm17[2,3,4,5],zero,ymm17[3],zero,ymm17[9,10,11,6],zero,ymm17[4],zero,ymm17[18],zero,ymm17[18,19,20,21],zero,ymm17[19],zero,ymm17[25,26,27,22],zero,ymm17[20],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm13 = zero,ymm12[2],zero,zero,zero,zero,ymm12[5],zero,ymm12[3],zero,zero,zero,zero,ymm12[6],zero,ymm12[4],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k4} +; AVX512BW-SLOW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k3} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm29 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512BW-SLOW-NEXT: movl $338170920, %eax # imm = 0x14281428 +; AVX512BW-SLOW-NEXT: kmovd %eax, %k3 +; AVX512BW-SLOW-NEXT: vpshufb %ymm29, %ymm8, %ymm0 {%k3} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm25, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm18 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm5, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm26[0],xmm27[0],xmm26[1],xmm27[1],xmm26[2],xmm27[2],xmm26[3],xmm27[3],xmm26[4],xmm27[4],xmm26[5],xmm27[5],xmm26[6],xmm27[6],xmm26[7],xmm27[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm11 -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm2, %ymm19 -; AVX512BW-SLOW-NEXT: vporq %ymm11, %ymm19, %ymm11 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm0, %ymm5, %ymm19 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512BW-SLOW-NEXT: vporq %ymm19, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm19 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm24 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm19, %ymm24, %ymm19 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: movabsq $3485998880071096368, %rsi # imm = 0x3060C183060C1830 -; AVX512BW-SLOW-NEXT: kmovq %rsi, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k3} -; AVX512BW-SLOW-NEXT: movabsq $-4357498600088870461, %rsi # imm = 0xC3870E1C3870E1C3 -; AVX512BW-SLOW-NEXT: kmovq %rsi, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] -; AVX512BW-SLOW-NEXT: movl $338170920, %esi # imm = 0x14281428 -; AVX512BW-SLOW-NEXT: kmovd %esi, %k2 -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm3 {%k2} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm26 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm18, %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm27 = xmm27[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm19, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm26, %zmm19 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm26 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm27 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm26, %ymm27, %ymm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm26 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rax), %zmm30 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-SLOW-NEXT: vpermw %zmm30, %zmm24, %zmm24 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm13 {%k4} +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm19[27],zero,zero,zero,zero,ymm19[30],zero,ymm19[28],zero,zero,zero,zero,ymm19[31],zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm25[27],zero,zero,zero,zero,ymm25[30],zero,ymm25[28],zero,zero,zero,zero,ymm25[31],zero,ymm25[29] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm14, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm24, %zmm0 {%k3} +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k4} ; AVX512BW-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm19 {%k3} -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 {%k2} -; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm13 {%k4} +; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vpshufb %ymm31, %ymm8, %ymm0 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm25 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-SLOW-NEXT: vporq %ymm2, %ymm25, %ymm2 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,7,128,128,128,128,10,128,8,128,128,128,128,11,128,9,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm7, %ymm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm25 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm4[18,19,20,21],zero,zmm4[19],zero,zmm4[25,26,27,22],zero,zmm4[20],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm4[55],zero,zmm4[53,54,55,58],zero,zmm4[56],zero,zmm4[60,61,58,59] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm4 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22],zero,zmm5[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm4, %zmm4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-SLOW-NEXT: vpermw %zmm14, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm25[18],zero,zmm25[20,21,20,21],zero,zmm25[19],zero,zmm25[19,20,21,22],zero,zmm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm25[55],zero,zmm25[55,56,57,58],zero,zmm25[56],zero,zmm25[62,63] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm5 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm19[20],zero,zmm19[18],zero,zero,zero,zero,zmm19[21],zero,zmm19[19],zero,zero,zero,zero,zmm19[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm19[57],zero,zmm19[55],zero,zero,zero,zero,zmm19[58],zero,zmm19[56],zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm2, %zmm5, %zmm2 +; AVX512BW-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k4} +; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k4} +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm23[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512BW-SLOW-NEXT: vpshufb %ymm29, %ymm24, %ymm0 {%k3} +; AVX512BW-SLOW-NEXT: vpshufb %ymm1, %ymm24, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm23, %ymm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm17[8,9],zero,ymm17[7],zero,ymm17[5,6,7,10],zero,ymm17[8],zero,ymm17[12,13,10,11,24,25],zero,ymm17[23],zero,ymm17[21,22,23,26],zero,ymm17[24],zero,ymm17[28,29,26,27] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm12[9],zero,ymm12[7],zero,zero,zero,zero,ymm12[10],zero,ymm12[8],zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm12, %ymm2 +; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,3,3,4,6,7,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm5, %ymm2 {%k2} +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm25 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm31[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm2 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm1[23],zero,zmm1[23,24,25,26],zero,zmm1[24],zero,zmm1[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[59],zero,zero,zero,zero,zmm5[62],zero,zmm5[60],zero,zero,zero,zero,zmm5[63],zero,zmm5[61] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-SLOW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm25 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[25],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61] +; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm25 {%k3} -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm20[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm21[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm5 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm20, %ymm6 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm6 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm5[18,19,20,21],zero,zmm5[19],zero,zmm5[25,26,27,22],zero,zmm5[20],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm5[55],zero,zmm5[53,54,55,58],zero,zmm5[56],zero,zmm5[60,61,58,59] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[18],zero,zero,zero,zero,zmm6[21],zero,zmm6[19],zero,zero,zero,zero,zmm6[22],zero,zmm6[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm6[57],zero,zmm6[55],zero,zero,zero,zero,zmm6[58],zero,zmm6[56],zero,zero,zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm5, %zmm6, %zmm17 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k2} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm1[18],zero,zmm1[20,21,20,21],zero,zmm1[19],zero,zmm1[19,20,21,22],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm1[55],zero,zmm1[55,56,57,58],zero,zmm1[56],zero,zmm1[62,63] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm0[20],zero,zmm0[18],zero,zero,zero,zero,zmm0[21],zero,zmm0[19],zero,zero,zero,zero,zmm0[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm30, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 -; AVX512BW-SLOW-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm7 -; AVX512BW-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-SLOW-NEXT: vpermi2w %zmm30, %zmm9, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm27, %xmm12 -; AVX512BW-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm7 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm23, %xmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm29, %xmm5 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm29[0],xmm23[0],xmm29[1],xmm23[1],xmm29[2],xmm23[2],xmm29[3],xmm23[3],xmm29[4],xmm23[4],xmm29[5],xmm23[5],xmm29[6],xmm23[6],xmm29[7],xmm23[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm16, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm22, %xmm5 -; AVX512BW-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm22[0],xmm16[0],xmm22[1],xmm16[1],xmm22[2],xmm16[2],xmm22[3],xmm16[3],xmm22[4],xmm16[4],xmm22[5],xmm16[5],xmm22[6],xmm16[6],xmm22[7],xmm16[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm6, %xmm2 -; AVX512BW-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-SLOW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm11, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm25, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%rax) +; AVX512BW-SLOW-NEXT: addq $72, %rsp ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride7_vf64: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: subq $200, %rsp +; AVX512BW-FAST-NEXT: subq $136, %rsp ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa (%rax), %ymm6 -; AVX512BW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm1 -; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm14, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm10 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm25[8],xmm10[9],xmm25[9],xmm10[10],xmm25[10],xmm10[11],xmm25[11],xmm10[12],xmm25[12],xmm10[13],xmm25[13],xmm10[14],xmm25[14],xmm10[15],xmm25[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm22 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm3 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm6 +; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm18 ; AVX512BW-FAST-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm1, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm1, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm26 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm11, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm13, %ymm17 -; AVX512BW-FAST-NEXT: vporq %ymm8, %ymm17, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm27 = xmm27[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm27, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm5, %ymm9 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm22 +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm14, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm15, %ymm9 +; AVX512BW-FAST-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm25 = xmm25[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm25, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm8 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm6 {%k1} ; AVX512BW-FAST-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FAST-NEXT: vpermw %ymm12, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm1, %ymm18 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm27 -; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm27, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm27[20],zero,ymm27[18],zero,ymm27[20,21,20,21],zero,ymm27[19],zero,ymm27[19,20,21,22],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm26 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm26, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm26 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm18, %zmm6 {%k1} +; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-FAST-NEXT: vpermw %ymm28, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm28, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %ymm29 +; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm29, %ymm1 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm30 +; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm2 +; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm30[18],zero,ymm30[20,21,20,21],zero,ymm30[19],zero,ymm30[19,20,21,22],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm2, %ymm18, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 ; AVX512BW-FAST-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 ; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm26 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm22 -; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm22, %ymm18 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %ymm30 -; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm30, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm21 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22],zero,ymm30[20] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm21, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512BW-FAST-NEXT: vpshufb %ymm28, %ymm31, %ymm18 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512BW-FAST-NEXT: vpshufb %ymm29, %ymm0, %ymm20 -; AVX512BW-FAST-NEXT: vporq %ymm18, %ymm20, %ymm18 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm20 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm31[18,19,20,21],zero,ymm31[19],zero,ymm31[21,20,21,22],zero,ymm31[20],zero,ymm31[22,23] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %ymm27 +; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm27, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm5, %ymm1 +; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm27[2],zero,ymm27[2,3,4,5],zero,ymm27[3],zero,ymm27[9,10,11,6],zero,ymm27[4],zero,ymm27[18],zero,ymm27[18,19,20,21],zero,ymm27[19],zero,ymm27[25,26,27,22],zero,ymm27[20],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm5[2],zero,zero,zero,zero,ymm5[5],zero,ymm5[3],zero,zero,zero,zero,ymm5[6],zero,ymm5[4],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm3, %ymm16 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm2, %ymm22 +; AVX512BW-FAST-NEXT: vporq %ymm16, %ymm22, %ymm16 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm3[2,3,4,5],zero,ymm3[3],zero,ymm3[5,4,5,6],zero,ymm3[4],zero,ymm3[6,7,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = zero,zero,zero,zero,ymm2[5],zero,ymm2[3],zero,zero,zero,zero,ymm2[6],zero,ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm20, %ymm23, %ymm20 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %zmm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 -; AVX512BW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512BW-FAST-NEXT: kmovq %r10, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm21, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm21 -; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512BW-FAST-NEXT: kmovq %r10, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm26, %zmm18 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 (%rax), %zmm26 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[0,1,2,3],zmm23[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm22[23],zero,zmm22[21,22,23,26],zero,zmm22[24],zero,zmm22[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm30[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm28 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm28[25],zero,zmm28[23],zero,zero,zero,zero,zmm28[26],zero,zmm28[24],zero,zero,zero,zero,zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm28 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm28, %zmm29 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[27],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm0[60],zero,zmm0[62,63,62,63],zero,zmm0[61],zero,zmm0[63,60,61] +; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm23, %ymm22 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm16 +; AVX512BW-FAST-NEXT: movabsq $3485998880071096368, %rsi # imm = 0x3060C183060C1830 +; AVX512BW-FAST-NEXT: kmovq %rsi, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: movabsq $-4357498600088870461, %rsi # imm = 0xC3870E1C3870E1C3 +; AVX512BW-FAST-NEXT: kmovq %rsi, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm18, %zmm16 {%k2} +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm14[28],zero,ymm14[30,31,30,31],zero,ymm14[29],zero,ymm14[31,28,29] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm15[30],zero,ymm15[28],zero,zero,zero,zero,ymm15[31],zero,ymm15[29],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm4, %ymm18, %ymm4 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm22 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm18 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm4, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm25 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm26 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm26[0],xmm25[1],xmm26[1],xmm25[2],xmm26[2],xmm25[3],xmm26[3],xmm25[4],xmm26[4],xmm25[5],xmm26[5],xmm25[6],xmm26[6],xmm25[7],xmm26[7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm23, %xmm23 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm22, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm22 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm4[27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,3,2,3] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm23 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,3,2,3] +; AVX512BW-FAST-NEXT: vporq %ymm22, %ymm23, %ymm24 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rax), %xmm24 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm31, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm31, %zmm1 +; AVX512BW-FAST-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm27, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm5, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm0[23],zero,zmm0[21,22,23,26],zero,zmm0[24],zero,zmm0[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm31[0,1,2,3],zmm3[4,5,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm22 = zmm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm22[23],zero,zero,zero,zero,zmm22[26],zero,zmm22[24],zero,zero,zero,zero,zmm22[27],zero,zmm22[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm22[62],zero,zmm22[60],zero,zero,zero,zero,zmm22[63],zero,zmm22[61],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zero,zero,zmm1[27],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm1[60],zero,zmm1[62,63,62,63],zero,zmm1[61],zero,zmm1[63,60,61] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm2[23],zero,zero,zero,zero,zmm2[26],zero,zmm2[24],zero,zero,zero,zero,zmm2[27],zero,zmm2[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero,zmm2[61],zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm1, %zmm2, %zmm27 ; AVX512BW-FAST-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm29, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm27[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k1} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm29, %zmm30, %zmm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm29, %zmm1 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm9[4,5,6,7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] ; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,62,61,62,63,63,62,62,63,62,61,62,63,63,62,62,63] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-FAST-NEXT: vpermw %zmm28, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-FAST-NEXT: kmovq %rax, %k3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm22 {%k3} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm11[28],zero,ymm11[30,31,30,31],zero,ymm11[29],zero,ymm11[31,28,29] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero,ymm13[29],zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm28[0],xmm30[0],xmm28[1],xmm30[1],xmm28[2],xmm30[2],xmm28[3],xmm30[3],xmm28[4],xmm30[4],xmm28[5],xmm30[5],xmm28[6],xmm30[6],xmm28[7],xmm30[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm7[30],zero,ymm7[28],zero,zero,zero,zero,ymm7[31],zero,ymm7[29],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm27 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,3,2,3] -; AVX512BW-FAST-NEXT: vporq %ymm0, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm31 -; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm31[0],xmm1[0],xmm31[1],xmm1[1],xmm31[2],xmm1[2],xmm31[3],xmm1[3],xmm31[4],xmm1[4],xmm31[5],xmm1[5],xmm31[6],xmm1[6],xmm31[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm24[27],zero,zero,zero,zero,ymm24[30],zero,ymm24[28],zero,zero,zero,zero,ymm24[31],zero,ymm24[29] -; AVX512BW-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm2, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-FAST-NEXT: vpermw %zmm26, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2} -; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm27 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm31, %xmm24 -; AVX512BW-FAST-NEXT: vporq %xmm2, %xmm24, %xmm2 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm30, %xmm24 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm31 = -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm28, %xmm29 -; AVX512BW-FAST-NEXT: vporq %xmm24, %xmm29, %xmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm28 = xmm28[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm24 = zmm24[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm28 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm3, %xmm30 -; AVX512BW-FAST-NEXT: vporq %xmm28, %xmm30, %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm28, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] -; AVX512BW-FAST-NEXT: vpermi2w %zmm26, %zmm12, %zmm4 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm4, %zmm3 {%k2} -; AVX512BW-FAST-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm24 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm16, %xmm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm19, %xmm2 -; AVX512BW-FAST-NEXT: vpshufb %xmm31, %xmm17, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm19, %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm17, %xmm3 ; AVX512BW-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm21, %xmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm28 = +; AVX512BW-FAST-NEXT: vpshufb %xmm28, %xmm20, %xmm17 +; AVX512BW-FAST-NEXT: vporq %xmm5, %xmm17, %xmm5 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm17, %zmm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm17 = zmm5[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm10, %xmm1 -; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm25[0],xmm10[1],xmm25[1],xmm10[2],xmm25[2],xmm10[3],xmm25[3],xmm10[4],xmm25[4],xmm10[5],xmm25[5],xmm10[6],xmm25[6],xmm10[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm1 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm12, %xmm3 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm20 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm13, %xmm5 +; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm12, %zmm5 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm0 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} ; AVX512BW-FAST-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm0[19],zero,zmm0[21,20,21,22],zero,zmm0[20],zero,zmm0[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm0[55],zero,zero,zero,zero,zmm0[58],zero,zmm0[56],zero,zero,zero,zero,zmm0[59],zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm3[21],zero,zmm3[19],zero,zero,zero,zero,zmm3[22],zero,zmm3[20],zero,zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero,zmm3[57] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm3, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm3[18,19,20,21],zero,zmm3[19],zero,zmm3[25,26,27,22],zero,zmm3[20],zero,zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm3[55],zero,zmm3[53,54,55,58],zero,zmm3[56],zero,zmm3[60,61,58,59] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm21, %zmm4 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] -; AVX512BW-FAST-NEXT: vporq %zmm0, %zmm4, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FAST-NEXT: vpermw %zmm1, %zmm4, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FAST-NEXT: vpermw %zmm7, %zmm3, %zmm3 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm5 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm11[18],zero,zmm11[20,21,20,21],zero,zmm11[19],zero,zmm11[19,20,21,22],zero,zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm11[55],zero,zmm11[55,56,57,58],zero,zmm11[56],zero,zmm11[62,63] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[20],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm5, %zmm11, %zmm5 ; AVX512BW-FAST-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: kmovq %rax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm5 {%k2} +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm3[19],zero,zmm3[21,20,21,22],zero,zmm3[20],zero,zmm3[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm3[55],zero,zero,zero,zero,zmm3[58],zero,zmm3[56],zero,zero,zero,zero,zmm3[59],zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm11 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm11[21],zero,zmm11[19],zero,zero,zero,zero,zmm11[22],zero,zmm11[20],zero,zero,zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm11[55],zero,zero,zero,zero,zmm11[58],zero,zmm11[56],zero,zero,zero,zero,zmm11[59],zero,zmm11[57] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm3, %zmm11, %zmm3 +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm7 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm4[18,19,20,21],zero,zmm4[19],zero,zmm4[25,26,27,22],zero,zmm4[20],zero,zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm4[55],zero,zmm4[53,54,55,58],zero,zmm4[56],zero,zmm4[60,61,58,59] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} zmm8 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm4[18],zero,zero,zero,zero,zmm4[21],zero,zmm4[19],zero,zero,zero,zero,zmm4[22],zero,zmm4[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm4[57],zero,zmm4[55],zero,zero,zero,zero,zmm4[58],zero,zmm4[56],zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] +; AVX512BW-FAST-NEXT: vporq %zmm7, %zmm8, %zmm7 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} ; AVX512BW-FAST-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-FAST-NEXT: kmovq %rax, %k2 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm5, %zmm7 {%k2} +; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm26, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm25, %xmm1 +; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb %xmm28, %xmm9, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm22, %xmm0 +; AVX512BW-FAST-NEXT: vpshufb %xmm20, %xmm23, %xmm2 +; AVX512BW-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm24, %zmm24, %zmm2 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-FAST-NEXT: vpermw %zmm2, %zmm3, %zmm2 +; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] +; AVX512BW-FAST-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 +; AVX512BW-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} +; AVX512BW-FAST-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} +; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rax) -; AVX512BW-FAST-NEXT: addq $200, %rsp +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm27, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FAST-NEXT: addq $136, %rsp ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 @@ -9182,9 +8630,11 @@ ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} +; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} +; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -25,56 +25,98 @@ ; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: movdqa (%r8), %xmm2 ; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; SSE-NEXT: psrld $16, %xmm2 +; SSE-NEXT: packssdw %xmm2, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movdqa %xmm0, (%rax) +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, (%rax) ; SSE-NEXT: retq ; -; AVX-LABEL: store_i8_stride8_vf2: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa (%rdx), %xmm1 -; AVX-NEXT: vmovdqa (%r8), %xmm2 -; AVX-NEXT: vmovdqa (%r11), %xmm3 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] -; AVX-NEXT: vmovdqa %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: store_i8_stride8_vf2: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: store_i8_stride8_vf2: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm2 +; AVX2-ONLY-NEXT: vmovdqa (%r11), %xmm3 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %xmm3 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride8_vf2: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vmovdqa (%r11), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[8,10,12,14,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,1,3,5,7,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vmovdqa %xmm0, (%rax) +; AVX512-NEXT: retq %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <2 x i8>, ptr %in.vecptr2, align 64 @@ -103,73 +145,52 @@ ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa (%r8), %xmm2 -; SSE-NEXT: movdqa (%r11), %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,5] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,6,4] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: pandn %xmm7, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm7 -; SSE-NEXT: por %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,0,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm9 +; SSE-NEXT: movdqa (%r8), %xmm3 +; SSE-NEXT: movdqa (%r11), %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,1] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: pandn %xmm9, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,6] -; SSE-NEXT: packuswb %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: por %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: movdqa %xmm7, (%rax) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf4: @@ -181,12 +202,12 @@ ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa (%r11), %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],mem[0],xmm3[1],mem[1] +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15] ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 @@ -202,28 +223,53 @@ ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; -; AVX2-LABEL: store_i8_stride8_vf4: -; AVX2: # %bb.0: -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-NEXT: vmovdqa (%r11), %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-ONLY-LABEL: store_i8_stride8_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-ONLY-NEXT: vpbroadcastd (%rcx), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%rdx), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX2-ONLY-NEXT: vpbroadcastd (%r10), %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rax) +; AVX2-ONLY-NEXT: vzeroupper +; AVX2-ONLY-NEXT: retq +; +; AVX512-LABEL: store_i8_stride8_vf4: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-NEXT: vpbroadcastd (%rdx), %xmm2 +; AVX512-NEXT: vpunpckldq (%rcx){1to4}, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512-NEXT: vpbroadcastd (%r11), %xmm2 +; AVX512-NEXT: vpunpckldq (%r10){1to4}, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15,16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <4 x i8>, ptr %in.vecptr2, align 64 @@ -779,38 +825,38 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: por %xmm9, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: pandn %xmm9, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: por %xmm5, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: por %xmm9, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm12 ; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: por %xmm12, %xmm6 @@ -861,35 +907,35 @@ ; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] ; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: pandn %xmm11, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm11 ; SSE-NEXT: por %xmm13, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm0 @@ -898,12 +944,12 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm11, 112(%rax) +; SSE-NEXT: movdqa %xmm0, 112(%rax) +; SSE-NEXT: movdqa %xmm11, 96(%rax) ; SSE-NEXT: movdqa %xmm5, 80(%rax) ; SSE-NEXT: movdqa %xmm9, 64(%rax) -; SSE-NEXT: movdqa %xmm12, 32(%rax) -; SSE-NEXT: movdqa %xmm8, 48(%rax) +; SSE-NEXT: movdqa %xmm12, 48(%rax) +; SSE-NEXT: movdqa %xmm8, 32(%rax) ; SSE-NEXT: movdqa %xmm7, 16(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%rax) @@ -1177,119 +1223,171 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %in.vecptr7, ptr %out.vec) nounwind { ; SSE-LABEL: store_i8_stride8_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $216, %rsp +; SSE-NEXT: subq $56, %rsp ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE-NEXT: movdqa (%rdi), %xmm9 -; SSE-NEXT: movdqa (%rsi), %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa (%rsi), %xmm11 +; SSE-NEXT: movdqa (%rdx), %xmm12 +; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r8), %xmm13 -; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: movdqa (%r8), %xmm8 +; SSE-NEXT: movdqa (%r9), %xmm1 ; SSE-NEXT: movdqa (%r10), %xmm14 -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rax), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm5, %xmm10 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm13[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pandn %xmm6, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm11, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm11, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm11 +; SSE-NEXT: por %xmm15, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm5 ; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm14[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: # xmm14 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] ; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload ; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] ; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 @@ -1298,1104 +1396,992 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rax), %xmm10 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa 16(%r8), %xmm13 -; SSE-NEXT: movdqa 16(%r9), %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm12 -; SSE-NEXT: movdqa 16(%rcx), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa 16(%rax), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 16(%r9), %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa 16(%rdx), %xmm4 +; SSE-NEXT: movdqa 16(%rcx), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm12 +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 16(%rsi), %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,0,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm12, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: pandn %xmm8, %xmm15 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,1] +; SSE-NEXT: pand %xmm9, %xmm8 +; SSE-NEXT: por %xmm15, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,1,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm2 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm14[2,2,2,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: por %xmm2, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = xmm11[8],mem[8],xmm11[9],mem[9],xmm11[10],mem[10],xmm11[11],mem[11],xmm11[12],mem[12],xmm11[13],mem[13],xmm11[14],mem[14],xmm11[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: pandn %xmm2, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm9, %xmm2 +; SSE-NEXT: por %xmm5, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = xmm12[8],mem[8],xmm12[9],mem[9],xmm12[10],mem[10],xmm12[11],mem[11],xmm12[12],mem[12],xmm12[13],mem[13],xmm12[14],mem[14],xmm12[15],mem[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[0,2,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,0,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[0,0,0,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,3,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,5,7] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: pandn %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[2,2,2,2] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: pandn %xmm2, %xmm5 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,6,5,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: por %xmm5, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,3,3] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[3,3,3,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pand %xmm8, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm8 -; SSE-NEXT: por %xmm10, %xmm8 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,4,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[2,2,2,2] +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,3,3] +; SSE-NEXT: movdqa %xmm7, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,2,2,2] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,6,5,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pand %xmm9, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,6,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm7 +; SSE-NEXT: por %xmm3, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE-NEXT: movdqa %xmm3, 240(%rax) ; SSE-NEXT: movdqa %xmm6, 224(%rax) -; SSE-NEXT: movdqa %xmm5, 240(%rax) -; SSE-NEXT: movdqa %xmm4, 160(%rax) -; SSE-NEXT: movdqa %xmm9, 176(%rax) -; SSE-NEXT: movdqa %xmm0, 96(%rax) -; SSE-NEXT: movdqa %xmm3, 112(%rax) -; SSE-NEXT: movdqa %xmm1, 32(%rax) -; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movdqa %xmm2, 208(%rax) +; SSE-NEXT: movdqa %xmm5, 192(%rax) +; SSE-NEXT: movdqa %xmm1, 176(%rax) +; SSE-NEXT: movdqa %xmm8, 160(%rax) +; SSE-NEXT: movdqa %xmm15, 144(%rax) +; SSE-NEXT: movdqa %xmm12, 128(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 192(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 208(%rax) +; SSE-NEXT: movaps %xmm0, 112(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 128(%rax) +; SSE-NEXT: movaps %xmm0, 96(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 144(%rax) +; SSE-NEXT: movaps %xmm0, 80(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 64(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, 80(%rax) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movaps %xmm0, 48(%rax) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: addq $216, %rsp +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: addq $56, %rsp ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp +; AVX1-ONLY-NEXT: subq $40, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm13 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm8 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovaps 16(%r9), %xmm1 +; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm15 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm14, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10 +; AVX1-ONLY-NEXT: vandps %ymm8, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm8, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm4[1],ymm11[2],ymm4[3],ymm11[4],ymm4[5],ymm11[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm8, %ymm11 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm7 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm11, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm6 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm14, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm3[0],ymm11[1],ymm3[2],ymm11[3],ymm3[4],ymm11[5],ymm3[6],ymm11[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm8, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm11, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) +; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: addq $40, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride8_vf32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $88, %rsp +; AVX2-SLOW-NEXT: pushq %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm8 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 16(%r10), %xmm8 -; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7,8],ymm14[9],ymm10[10,11,12],ymm14[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2],ymm0[3],ymm14[4],ymm0[5],ymm14[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm10 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm7[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm3, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7,8],ymm15[9],ymm4[10,11,12],ymm15[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2],ymm0[3],ymm15[4],ymm0[5],ymm15[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm5[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm13, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7,8],ymm13[9],ymm12[10,11,12],ymm13[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2],ymm1[3],ymm12[4],ymm1[5],ymm12[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3],ymm15[4,5,6],ymm1[7],ymm15[8,9,10],ymm1[11],ymm15[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7,8],ymm9[9],ymm7[10,11,12],ymm9[13],ymm7[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm15, %ymm6 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm7[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7,8],ymm15[9],ymm6[10,11,12],ymm15[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2],ymm1[3],ymm15[4],ymm1[5],ymm15[6],ymm1[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1,2],ymm2[3],ymm15[4,5,6],ymm2[7],ymm15[8,9,10],ymm2[11],ymm15[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6],ymm8[7],ymm10[8,9,10],ymm8[11],ymm10[12,13,14],ymm8[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7,8],ymm11[9],ymm10[10,11,12],ymm11[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm10 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm10, %ymm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2],ymm7[3],ymm12[4,5,6],ymm7[7],ymm12[8,9,10],ymm7[11],ymm12[12,13,14],ymm7[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5,6],ymm10[7],ymm12[8,9,10],ymm10[11],ymm12[12,13,14],ymm10[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm13, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm10[1],ymm1[2],ymm10[3],ymm1[4],ymm10[5],ymm1[6],ymm10[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4],ymm7[5],ymm0[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm8[8],xmm5[9],xmm8[9],xmm5[10],xmm8[10],xmm5[11],xmm8[11],xmm5[12],xmm8[12],xmm5[13],xmm8[13],xmm5[14],xmm8[14],xmm5[15],xmm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, 160(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm10, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) -; AVX2-SLOW-NEXT: addq $88, %rsp +; AVX2-SLOW-NEXT: popq %rax ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride8_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $72, %rsp +; AVX2-FAST-NEXT: pushq %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm10 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm10 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7],ymm1[8,9,10],ymm4[11],ymm1[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7],ymm11[8,9,10],ymm10[11],ymm11[12,13,14],ymm10[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4],ymm12[5],ymm1[6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7,8],ymm2[9],ymm4[10,11,12],ymm2[13],ymm4[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: addq $72, %rsp +; AVX2-FAST-NEXT: popq %rax ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride8_vf32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: pushq %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5,6],ymm3[7],ymm10[8,9,10],ymm3[11],ymm10[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6],ymm4[7],ymm1[8,9,10],ymm4[11],ymm1[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7],ymm11[8,9,10],ymm10[11],ymm11[12,13,14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7],ymm6[8,9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm10[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3],ymm13[4],ymm15[5],ymm13[6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3],xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4],ymm12[5],ymm1[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm12, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm11, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7,8],ymm7[9],ymm6[10,11,12],ymm7[13],ymm6[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7,8],ymm2[9],ymm4[10,11,12],ymm2[13],ymm4[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $72, %rsp +; AVX2-FAST-PERLANE-NEXT: popq %rax ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -2403,219 +2389,205 @@ ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm24 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3],ymm10[4,5,6],ymm2[7],ymm10[8,9,10],ymm2[11],ymm10[12,13,14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3],ymm7[4,5,6],ymm1[7],ymm7[8,9,10],ymm1[11],ymm7[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm10 -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm17 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm21 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm7[0],xmm15[1],xmm7[1],xmm15[2],xmm7[2],xmm15[3],xmm7[3],xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rax), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm7[8],xmm15[9],xmm7[9],xmm15[10],xmm7[10],xmm15[11],xmm7[11],xmm15[12],xmm7[12],xmm15[13],xmm7[13],xmm15[14],xmm7[14],xmm15[15],xmm7[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm7[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm11, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r9), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm15, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3],ymm8[4,5,6],ymm10[7],ymm8[8,9,10],ymm10[11],ymm8[12,13,14],ymm10[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7],ymm6[8,9,10],ymm2[11],ymm6[12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdx), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm12, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7,8],ymm12[9],ymm9[10,11,12],ymm12[13],ymm9[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6],ymm12[7],ymm14[8,9,10],ymm12[11],ymm14[12,13,14],ymm12[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6],ymm0[7],ymm13[8,9,10],ymm0[11],ymm13[12,13,14],ymm0[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7,8],ymm2[9],ymm12[10,11,12],ymm2[13],ymm12[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7,8],ymm12[9],ymm0[10,11,12],ymm12[13],ymm0[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5,6],ymm3[7],ymm7[8,9,10],ymm3[11],ymm7[12,13,14],ymm3[15] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6],ymm2[7],ymm5[8,9,10],ymm2[11],ymm5[12,13,14],ymm2[15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] +; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,1,3,3,6,5,7,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1] ; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] -; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm18, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vzeroupper ; AVX512F-ONLY-SLOW-NEXT: retq ; @@ -2623,162 +2595,166 @@ ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r10), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7],ymm5[8,9,10],ymm0[11],ymm5[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2,3,4],ymm2[5],ymm8[6,7,8],ymm2[9],ymm8[10,11,12],ymm2[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm0, %ymm17, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3],ymm12[4,5,6],ymm1[7],ymm12[8,9,10],ymm1[11],ymm12[12,13,14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r10), %xmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3],ymm15[4,5,6],ymm0[7],ymm15[8,9,10],ymm0[11],ymm15[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm17 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm14[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1],ymm15[2,3,4],ymm8[5],ymm15[6,7,8],ymm8[9],ymm15[10,11,12],ymm8[13],ymm15[14,15] -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm18 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rax), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm12 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm28, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm9 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm22 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm9, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3],xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm12[0,1,2],ymm6[3],ymm12[4,5,6],ymm6[7],ymm12[8,9,10],ymm6[11],ymm12[12,13,14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm27, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] -; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512F-ONLY-FAST-NEXT: kmovw %ecx, %k1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm25, %zmm19 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm20, %zmm1 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm18, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm21, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-ONLY-FAST-NEXT: vzeroupper ; AVX512F-ONLY-FAST-NEXT: retq ; @@ -2787,212 +2763,200 @@ ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm6 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15] -; AVX512DQ-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-SLOW-NEXT: kmovw %ecx, %k1 -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1} -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7,8],ymm5[9],ymm2[10,11,12],ymm5[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7,8],ymm4[9],ymm0[10,11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r10), %xmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rax), %xmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm5 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rsi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm14, %ymm4 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3],ymm13[4,5,6],ymm5[7],ymm13[8,9,10],ymm5[11],ymm13[12,13,14],ymm5[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm13, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7],ymm6[8,9,10],ymm4[11],ymm6[12,13,14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: movw $-21846, %si # imm = 0xAAAA +; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm5, %zmm4, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdx), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm7[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm3, %ymm3 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7,8],ymm8[9],ymm3[10,11,12],ymm8[13],ymm3[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm9, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%r8), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm11[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6],ymm13[7],ymm12[8,9,10],ymm13[11],ymm12[12,13,14],ymm13[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm13, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7],ymm11[8,9,10],ymm10[11],ymm11[12,13,14],ymm10[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm10, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7,8],ymm12[9],ymm11[10,11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm13[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm15, %ymm11 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3],ymm14[4,5,6],ymm12[7],ymm14[8,9,10],ymm12[11],ymm14[12,13,14],ymm12[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3],ymm13[4,5,6],ymm11[7],ymm13[8,9,10],ymm11[11],ymm13[12,13,14],ymm11[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm12, %zmm11, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[3,3,3,3] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6],ymm4[7],ymm6[8,9,10],ymm4[11],ymm6[12,13,14],ymm4[15] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm2 {%k1} ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -3000,160 +2964,160 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3,4],ymm0[5],ymm5[6,7,8],ymm0[9],ymm5[10,11,12],ymm0[13],ymm5[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6],ymm7[7],ymm2[8,9,10],ymm7[11],ymm2[12,13,14],ymm7[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] +; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA +; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm13[8],xmm10[9],xmm13[9],xmm10[10],xmm13[10],xmm10[11],xmm13[11],xmm10[12],xmm13[12],xmm10[13],xmm13[13],xmm10[14],xmm13[14],xmm10[15],xmm13[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm9 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm5 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm25 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7,8],ymm1[9],ymm10[10,11,12],ymm1[13],ymm10[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm11, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm10 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm12 -; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: movw $-21846, %r11w # imm = 0xAAAA -; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm17 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm29 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm14, %ymm0 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23,u,u,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7,8],ymm14[9],ymm0[10,11,12],ymm14[13],ymm0[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,8,9,u,u,8,9,10,11,10,11,u,u,16,17,18,19,28,29,u,u,28,29,26,27,30,31,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,0,1,u,u,u,u,u,u,2,3,u,u,u,u,u,u,20,21,u,u,u,u,u,u,22,23] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[0,1,2,3,0,1,u,u,8,9,10,11,2,3,u,u,20,21,18,19,20,21,u,u,24,25,26,27,22,23,u,u] -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5,6],ymm0[7],ymm8[8,9,10],ymm0[11],ymm8[12,13,14],ymm0[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm7 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm11 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm5 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm15 -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm9 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm3, %zmm1, %zmm0 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm4 -; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r10), %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm22 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%rax), %xmm14 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3],ymm5[4,5,6],ymm0[7],ymm5[8,9,10],ymm0[11],ymm5[12,13,14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm5 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm4, %zmm2, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -3162,110 +3126,90 @@ ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r11), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm25[8],xmm1[9],xmm25[9],xmm1[10],xmm25[10],xmm1[11],xmm25[11],xmm1[12],xmm25[12],xmm1[13],xmm25[13],xmm1[14],xmm25[14],xmm1[15],xmm25[15] +; AVX512BW-SLOW-NEXT: vmovdqa (%r11), %xmm11 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r11), %xmm0 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm12 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm23, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm10 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm17 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm18 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm22 = xmm20[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> -; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm5, %zmm4, %zmm15 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm5[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,2,3,4,33,6,7,8,34,10,11,12,35,14,15,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-SLOW-NEXT: vpermt2w %zmm18, %zmm10, %zmm5 ; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm20 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm4, %zmm23, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm21 = xmm4[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm19[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm24 = xmm19[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm0 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7,8],ymm0[9],ymm10[10,11,12],ymm0[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm19, %ymm22, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm10, %zmm13 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm12, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm14[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm15, %ymm15 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm11, %zmm10, %zmm14 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm23, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm11, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm25[0],xmm1[1],xmm25[1],xmm1[2],xmm25[2],xmm1[3],xmm25[3],xmm1[4],xmm25[4],xmm1[5],xmm25[5],xmm1[6],xmm25[6],xmm1[7],xmm25[7] +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermt2w %zmm1, %zmm23, %zmm2 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] -; AVX512BW-SLOW-NEXT: vpermt2w %ymm4, %ymm22, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm19, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm2 +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -3274,74 +3218,90 @@ ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm7 -; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm5 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm9 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,16,17,20,52,20,21,21,53,16,17,22,54,22,21,23,55] -; AVX512BW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm11 -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm19 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> -; AVX512BW-FAST-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 +; AVX512BW-FAST-NEXT: vmovdqa (%r11), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r11), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm12 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa 16(%r8), %xmm3 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,0,32,4,5,1,33,10,9,10,42,12,13,11,43,16,17,20,52,20,21,21,53,24,25,30,62,30,29,31,63] +; AVX512BW-FAST-NEXT: vpermt2w %zmm5, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm19 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm10 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,1,33,u,u,2,42,u,u,3,43,u,u,20,52,u,u,21,53,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-FAST-NEXT: vpermt2w %zmm18, %zmm10, %zmm5 ; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm11, %zmm6 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm15 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm19[0],xmm17[0],xmm19[1],xmm17[1],xmm19[2],xmm17[2],xmm19[3],xmm17[3],xmm19[4],xmm17[4],xmm19[5],xmm17[5],xmm19[6],xmm17[6],xmm19[7],xmm17[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm20, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm21 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm4, %zmm8 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 -; AVX512BW-FAST-NEXT: vpermt2w %zmm7, %zmm20, %zmm9 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm5 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm10, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm4, %zmm12 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpermt2w %zmm11, %zmm10, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm4, %zmm1 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm2, %zmm2 -; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -4156,7 +4116,7 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: subq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 @@ -4164,673 +4124,601 @@ ; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm13 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm12 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm15 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm9 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm13, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm10 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm11 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm14, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm6, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm12 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm14 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm10 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm11 ; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm11, %ymm11 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm11, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4],ymm10[5],ymm0[6],ymm10[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm14, %ymm9, %ymm9 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm8[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm12 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm10 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm14, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm11 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0],ymm13[1],ymm2[2],ymm13[3],ymm2[4],ymm13[5],ymm2[6],ymm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm8 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm14, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm14, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm6, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4],ymm2[5],ymm6[6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm6, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm8, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 192(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) -; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm4, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm15, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 384(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 352(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 288(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 256(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 224(%rax) ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 160(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, 448(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 128(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) -; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) +; AVX1-ONLY-NEXT: addq $296, %rsp # imm = 0x128 ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; ; AVX2-SLOW-LABEL: store_i8_stride8_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $328, %rsp # imm = 0x148 +; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa (%r10), %xmm2 +; AVX2-SLOW-NEXT: vmovdqa (%rax), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] +; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm15 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm6 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm11[1],ymm7[2,3,4],ymm11[5],ymm7[6,7,8],ymm11[9],ymm7[10,11,12],ymm11[13],ymm7[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm8 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa 48(%r10), %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7,8],ymm7[9],ymm1[10,11,12],ymm7[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 48(%r9), %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6],ymm5[7],ymm3[8,9,10],ymm5[11],ymm3[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%r8), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm13, %ymm12 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm15[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm9 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7,8],ymm12[9],ymm4[10,11,12],ymm12[13],ymm4[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] @@ -4842,1712 +4730,1657 @@ ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7,8],ymm14[9],ymm12[10,11,12],ymm14[13],ymm12[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r10), %xmm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3],ymm2[4,5,6],ymm5[7],ymm2[8,9,10],ymm5[11],ymm2[12,13,14],ymm5[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7,8],ymm1[9],ymm6[10,11,12],ymm1[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa 16(%rax), %xmm3 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%r9), %xmm11 -; AVX2-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[1,1,1,1] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1],ymm14[2,3,4],ymm0[5],ymm14[6,7,8],ymm0[9],ymm14[10,11,12],ymm0[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7,8],ymm3[9],ymm6[10,11,12],ymm3[13],ymm6[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 48(%r10), %xmm13 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7,8],ymm14[9],ymm1[10,11,12],ymm14[13],ymm1[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 48(%rax), %xmm14 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%r9), %xmm10 +; AVX2-SLOW-NEXT: vmovdqa 48(%r8), %xmm9 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm8 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4],ymm12[5],ymm1[6],ymm12[7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3],ymm8[4,5,6],ymm1[7],ymm8[8,9,10],ymm1[11],ymm8[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[3,3,3,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3,4],ymm0[5],ymm8[6,7,8],ymm0[9],ymm8[10,11,12],ymm0[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4],ymm1[5],ymm5[6],ymm1[7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3],ymm5[4,5,6],ymm3[7],ymm5[8,9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,0,2,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4],ymm10[5],ymm6[6,7,8],ymm10[9],ymm6[10,11,12],ymm10[13],ymm6[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,4,6,5] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 160(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm9, 192(%rax) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 448(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 416(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm12, 384(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 352(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 288(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: addq $328, %rsp # imm = 0x148 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) +; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: store_i8_stride8_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm14 +; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm8 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7,8],ymm15[9],ymm5[10,11,12],ymm15[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7],ymm8[8,9,10],ymm4[11],ymm8[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm15 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3,4],ymm0[5],ymm15[6,7,8],ymm0[9],ymm15[10,11,12],ymm0[13],ymm15[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7],ymm10[8,9,10],ymm0[11],ymm10[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm10 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7,8],ymm1[9],ymm10[10,11,12],ymm1[13],ymm10[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm15, %ymm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm12 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7],ymm11[8,9,10],ymm3[11],ymm11[12,13,14],ymm3[15] +; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4],ymm13[5],ymm5[6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm4, %xmm5 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm5 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm11 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] +; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7,8],ymm13[9],ymm11[10,11,12],ymm13[13],ymm11[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm7 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm15 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 48(%r10), %xmm5 +; AVX2-FAST-NEXT: vmovdqa 48(%rax), %xmm6 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vmovdqa 48(%r9), %xmm10 +; AVX2-FAST-NEXT: vmovdqa 48(%r8), %xmm12 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm15 +; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4],ymm2[5],ymm8[6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7,8],ymm9[9],ymm3[10,11,12],ymm9[13],ymm3[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm9 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm10 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 448(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 384(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: store_i8_stride8_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7,8],ymm15[9],ymm5[10,11,12],ymm15[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3],ymm8[4,5,6],ymm4[7],ymm8[8,9,10],ymm4[11],ymm8[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1],ymm15[2,3,4],ymm0[5],ymm15[6,7,8],ymm0[9],ymm15[10,11,12],ymm0[13],ymm15[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3],ymm10[4,5,6],ymm0[7],ymm10[8,9,10],ymm0[11],ymm10[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7,8],ymm1[9],ymm10[10,11,12],ymm1[13],ymm10[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm1[3],ymm4[4,5,6],ymm1[7],ymm4[8,9,10],ymm1[11],ymm4[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm15, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7,8],ymm12[9],ymm1[10,11,12],ymm12[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm15, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,28,29,u,u,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm4, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm12, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm11[0,1,2],ymm3[3],ymm11[4,5,6],ymm3[7],ymm11[8,9,10],ymm3[11],ymm11[12,13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3],xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3],ymm5[4],ymm13[5],ymm5[6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2],ymm2[3],ymm5[4],ymm2[5],ymm5[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r10), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm9, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm15[1],ymm11[2,3,4],ymm15[5],ymm11[6,7,8],ymm15[9],ymm11[10,11,12],ymm15[13],ymm11[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7,8],ymm13[9],ymm11[10,11,12],ymm13[13],ymm11[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm8[1],ymm11[2],ymm8[3],ymm11[4],ymm8[5],ymm11[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4],ymm4[5],ymm7[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,16,17,20,21,20,21,22,23,20,21,22,23,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7,8],ymm6[9],ymm1[10,11,12],ymm6[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7],ymm2[8,9,10],ymm1[11],ymm2[12,13,14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,24,25,28,29,20,21,22,23,28,29,30,31,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r10), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r9), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm12, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rcx), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm13, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2],ymm3[3],ymm11[4],ymm3[5],ymm11[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm13, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm11, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7,8],ymm9[9],ymm11[10,11,12],ymm9[13],ymm11[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2],ymm2[3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r10), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rax), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r9), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%r8), %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rsi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rcx), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdx), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4],ymm2[5],ymm8[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7],ymm8[8,9,10],ymm7[11],ymm8[12,13,14],ymm7[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm11, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4],ymm0[5],ymm7[6,7,8],ymm0[9],ymm7[10,11,12],ymm0[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,16,17,20,21,16,17,20,21,24,25,26,27,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,20,21,18,19,20,21,18,19,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7,8],ymm9[9],ymm3[10,11,12],ymm9[13],ymm3[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,16,17,18,19,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,16,17,18,19,28,29,26,27,28,29,26,27,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7,8],ymm5[9],ymm1[10,11,12],ymm5[13],ymm1[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6],ymm6[7],ymm7[8,9,10],ymm6[11],ymm7[12,13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7,8],ymm10[9],ymm9[10,11,12],ymm10[13],ymm9[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4],ymm6[5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 384(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 256(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 352(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 320(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 160(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 416(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 384(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 480(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 448(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $392, %rsp # imm = 0x188 +; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512F-SLOW-LABEL: store_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: subq $104, %rsp ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm8 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm16 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm21 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa (%r10), %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa (%rax), %xmm11 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm28 +; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm6, %ymm27 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm6 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm12 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm0, %ymm31 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm0 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm15 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm8, %ymm16 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm8, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm24 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm26 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm18 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm19 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm20 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm30 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm22 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm23 +; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm11 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm11 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm1 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm9, %ymm14 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm25 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm30 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 -; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512F-SLOW-NEXT: vpandnq %zmm6, %zmm3, %zmm6 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm27[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm12 +; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm7, %ymm21 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: movw $-21846, %r11w # imm = 0xAAAA +; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-SLOW-NEXT: vpord %zmm6, %zmm12, %zmm11 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm21 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm2, %zmm21 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512F-SLOW-NEXT: vpandnq %zmm5, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm24[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm26[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm6, %zmm6 +; AVX512F-SLOW-NEXT: vpord %zmm5, %zmm6, %zmm21 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm29 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm5 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm12 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm6, %ymm27 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm6, %ymm25 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 ; AVX512F-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm5, %ymm6, %ymm16 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm5, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rcx), %xmm5 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm19[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm6, %ymm13, %ymm18 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdx), %xmm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm2, %zmm10 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm7, %ymm19 +; AVX512F-SLOW-NEXT: vmovdqa 48(%r10), %xmm7 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm30[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm8[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm13, %ymm20 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rax), %xmm8 +; AVX512F-SLOW-NEXT: vpandnq %zmm12, %zmm3, %zmm12 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm13, %zmm13 +; AVX512F-SLOW-NEXT: vpord %zmm12, %zmm13, %zmm10 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm14 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm22 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm31 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm1, %ymm23 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rcx), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdx), %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm24 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm27 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm1, %ymm26 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm26 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r10), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm28 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm24 +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm1, %ymm30 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 16(%r9), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 16(%r8), %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm5 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm6 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpandnq %zmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA -; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vpord %zmm4, %zmm7, %zmm6 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm10, %ymm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm16 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm16 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm4, %zmm16 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm14 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm14 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm1 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm17 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm17 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm2, %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpord %zmm2, %zmm3, %zmm17 {%k1} -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm2, %ymm25 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm23 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13 +; AVX512F-SLOW-NEXT: vmovdqa 48(%r9), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 48(%r8), %xmm0 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm12 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,4,5,5,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm9, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw (%rsp), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm11, %ymm10, %ymm20 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm10, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3],xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[0,0,2,1,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,2,2,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm11, %ymm9 -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-SLOW-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3],xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,1,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,1,3,3,4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm8[0,1,2,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm12, %ymm8 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rsi), %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm7, %ymm30, %ymm7 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm13, %zmm13 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm7, %zmm5, %zmm13 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm29[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm7, %zmm7 -; AVX512F-SLOW-NEXT: vpandnq %zmm7, %zmm0, %zmm7 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm29 = ymm31[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm28, %zmm28 -; AVX512F-SLOW-NEXT: vpord %zmm7, %zmm28, %zmm13 {%k1} -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm15[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm15[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm30 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm30, %ymm28 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm29, %ymm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm12[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm30 = xmm12[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm15 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm26 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm5, %zmm26 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm7[3,3,3,3] -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm22 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 -; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm19 = mem[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512F-SLOW-NEXT: vpandnq %zmm22, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpord %zmm19, %zmm18, %zmm26 {%k1} -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm18, %ymm18 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm31, %ymm12 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-SLOW-NEXT: # ymm18 = mem[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm25[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm12, %zmm5, %zmm18 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm2, %zmm18 {%k1} +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm9, %ymm9 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm6 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[0,0,2,1,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm8 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm31 = xmm8[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm31[0],zero,zero,zero,xmm31[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm29 = xmm8[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm29[0],zero,zero,zero,xmm29[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm31, %ymm8 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm8, %zmm29 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm8, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm2, %zmm8 +; AVX512F-SLOW-NEXT: vpshufd $96, (%rsp), %ymm29 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm29 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm29, %zmm29 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm31 = ymm27[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm31, %zmm25 +; AVX512F-SLOW-NEXT: vpandnq %zmm29, %zmm3, %zmm29 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm25, %zmm25 +; AVX512F-SLOW-NEXT: vpord %zmm29, %zmm25, %zmm8 {%k1} +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm1, %ymm25 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm31 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm31[0],zero,zero,zero,xmm31[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm31, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm27 = mem[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm27, %zmm27 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm27 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rsi), %xmm0 +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm15 +; AVX512F-SLOW-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512F-SLOW-NEXT: # ymm31 = mem[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm31, %zmm16 +; AVX512F-SLOW-NEXT: vpandnq %zmm16, %zmm3, %zmm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm17, %zmm17 +; AVX512F-SLOW-NEXT: vpord %zmm16, %zmm17, %zmm27 {%k1} +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm1[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm31 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm15, %ymm15 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm31[0],zero,zero,zero,xmm31[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm18, %ymm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm1[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm1[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm19[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm19 = ymm20[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm16, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm15, %zmm2, %zmm16 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,3,2,3] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[3,3,3,3] +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm22 = ymm23[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm24[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 +; AVX512F-SLOW-NEXT: vpandnq %zmm14, %zmm3, %zmm14 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm22, %zmm22 +; AVX512F-SLOW-NEXT: vpord %zmm14, %zmm22, %zmm16 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm14, %ymm14 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm31, %ymm1 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm26[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm17 = ymm28[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm14 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm30[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm9 +; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm9, %zmm9 +; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm9, %zmm14 {%k1} ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm28, %ymm2 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,1,3,4,5,5,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,1,3,3,6,5,7,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm5, %zmm2 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,0,2,1,4,4,6,5] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,2,3,4,6,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-SLOW-NEXT: vpandnq %zmm1, %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vpandq %zmm0, %zmm3, %zmm0 -; AVX512F-SLOW-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm20, %ymm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,1,1,3,4,5,5,7] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[2,1,3,3,6,5,7,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm2, %zmm1 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,0,2,1,4,4,6,5] +; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,2,2,3,4,6,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpandnq %zmm0, %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512F-SLOW-NEXT: vpord %zmm0, %zmm2, %zmm1 {%k1} ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, 128(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm13, 320(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 384(%rax) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-SLOW-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, 320(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm27, 256(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-SLOW-NEXT: addq $104, %rsp ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: store_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $392, %rsp # imm = 0x188 +; AVX512F-FAST-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%rcx), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm13 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 32(%rcx), %xmm16 +; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm2 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa %ymm7, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm15 +; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm4 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3],xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm30 +; AVX512F-FAST-NEXT: vmovdqa64 48(%rcx), %xmm29 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm9 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm7 +; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm14, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm11, %ymm14 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm17 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdx), %xmm11 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm6, %zmm23 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm5 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r10), %xmm12 +; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm13 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm26 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm22 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm23 +; AVX512F-FAST-NEXT: vmovdqa 48(%r10), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 48(%rax), %xmm10 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm15, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 +; AVX512F-FAST-NEXT: vmovdqa 48(%r9), %xmm7 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm24 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm15 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm14 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa 16(%rcx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa 16(%r10), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa 16(%rax), %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm20 -; AVX512F-FAST-NEXT: vmovdqa 16(%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm16 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm31 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm31 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm29 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm6 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm17 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm16 +; AVX512F-FAST-NEXT: vmovdqa 48(%r8), %xmm13 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm8 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm6 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm8 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm20 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm8 +; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm4 +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm15 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm15, %zmm4 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm17, %zmm17 +; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm25 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm30, %zmm30 ; AVX512F-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vpord %zmm29, %zmm17, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1],xmm15[2],mem[2],xmm15[3],mem[3],xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm29, %ymm3 +; AVX512F-FAST-NEXT: vpord %zmm25, %zmm30, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm25, %ymm3 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm30 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm28, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm25 +; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm11 +; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm27, %ymm2 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm15, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm5, %ymm19, %ymm5 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm14 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm15, %ymm14 -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm10 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm27 +; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm3 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm31, %ymm3 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm11 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm10 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm11, %ymm10 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm7 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm7[8],xmm13[9],xmm7[9],xmm13[10],xmm7[10],xmm13[11],xmm7[11],xmm13[12],xmm7[12],xmm13[13],xmm7[13],xmm13[14],xmm7[14],xmm13[15],xmm7[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 +; AVX512F-FAST-NEXT: vmovdqa 48(%rsi), %xmm10 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm14 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm31, %ymm14 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm13, %ymm13 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm11 +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm30 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm7 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vpandnq (%rsp), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm30 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm1 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm1 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, (%rsp), %zmm15, %zmm2 # 64-byte Folded Reload +; AVX512F-FAST-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm6, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm26, %zmm15, %zmm3 ; AVX512F-FAST-NEXT: vpandnq %zmm22, %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm23, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm24, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpandnq %zmm25, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm26, %zmm11 ; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm27, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpandnq %zmm20, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm16, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm4 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm30, %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vpandnq %zmm31, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm13, %zmm11 -; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm17, %zmm6, %zmm9 -; AVX512F-FAST-NEXT: vpandnq %zmm29, %zmm0, %zmm6 -; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm28, %zmm0 -; AVX512F-FAST-NEXT: vpord %zmm6, %zmm0, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm21, %zmm15, %zmm9 +; AVX512F-FAST-NEXT: vpandnq %zmm24, %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm28, %zmm11 +; AVX512F-FAST-NEXT: vpord %zmm10, %zmm11, %zmm9 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm29, %zmm15, %zmm13 +; AVX512F-FAST-NEXT: vpandnq %zmm16, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm8, %zmm8 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm8, %zmm13 {%k1} +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm25, %zmm15, %zmm5 +; AVX512F-FAST-NEXT: vpandnq %zmm27, %zmm0, %zmm7 +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm6, %zmm0 +; AVX512F-FAST-NEXT: vpord %zmm7, %zmm0, %zmm5 {%k1} ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) -; AVX512F-FAST-NEXT: addq $392, %rsp # imm = 0x188 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 448(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512F-FAST-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -6555,228 +6388,224 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r10), %xmm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm19 -; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rax), %xmm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rax), %xmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512BW-SLOW-NEXT: vmovdqa (%r10), %xmm5 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r10), %xmm7 +; AVX512BW-SLOW-NEXT: vmovdqa (%rax), %xmm6 +; AVX512BW-SLOW-NEXT: vmovdqa 16(%rax), %xmm14 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%rax), %xmm8 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX512BW-SLOW-NEXT: vmovdqa 16(%r9), %xmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %xmm23 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa64 16(%r8), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm24 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r8), %xmm26 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] -; AVX512BW-SLOW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm2 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm31 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] -; AVX512BW-SLOW-NEXT: movl $572662306, %eax # imm = 0x22222222 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm5 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm7, %zmm17 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm17 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm18, %ymm18 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm31[0],xmm1[1],xmm31[1],xmm1[2],xmm31[2],xmm1[3],xmm31[3],xmm1[4],xmm31[4],xmm1[5],xmm31[5],xmm1[6],xmm31[6],xmm1[7],xmm31[7] +; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %xmm10 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm18 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm30 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm25 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm20, %zmm20, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm7, %zmm19 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm19 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm25 = xmm20[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm20[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm20[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm26, %ymm20 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm31[8],xmm1[9],xmm31[9],xmm1[10],xmm31[10],xmm1[11],xmm31[11],xmm1[12],xmm31[12],xmm1[13],xmm31[13],xmm1[14],xmm31[14],xmm1[15],xmm31[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm25, %zmm25, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm25 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm25 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm30[0],xmm0[1],xmm30[1],xmm0[2],xmm30[2],xmm0[3],xmm30[3],xmm0[4],xmm30[4],xmm0[5],xmm30[5],xmm0[6],xmm30[6],xmm0[7],xmm30[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm26 = xmm1[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm26 = xmm26[0],zero,zero,zero,xmm26[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm28 = xmm1[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm28[0],zero,zero,zero,xmm28[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm26, %ymm26 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,16,17,20,21,20,21,21,23,16,17,22,21,22,21,23,23] +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm2, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,16,17,20,20,20,20,22,21,16,17,20,22,20,22,22,23] +; AVX512BW-SLOW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm3, %zmm0 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %xmm18 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm20 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm1[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm4, %ymm4 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm28, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm31 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm26, %zmm1, %zmm26 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm14, %zmm26 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm28 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm29 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm23 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm21, %zmm21, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm21, %zmm7, %zmm21 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm8, %zmm21 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm30[8],xmm0[9],xmm30[9],xmm0[10],xmm30[10],xmm0[11],xmm30[11],xmm0[12],xmm30[12],xmm0[13],xmm30[13],xmm0[14],xmm30[14],xmm0[15],xmm30[15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm22 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm22, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm22 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm22 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm24 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm24 {%k1} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm27 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm27[0],zero,zero,zero,xmm27[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm0, %ymm27, %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm30 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm14, %zmm27 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm7, %zmm1 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm8, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-SLOW-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm3, %zmm3 -; AVX512BW-SLOW-NEXT: vpermw %zmm3, %zmm7, %zmm3 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm8, %zmm3 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpermw %zmm4, %zmm14, %zmm0 {%k2} -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %ymm17, %ymm1 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %xmm22 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm23 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm4, %zmm17 +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,20,20,18,19,22,21,22,21,20,22,18,19,22,23,22,23] +; AVX512BW-SLOW-NEXT: movl $572662306, %r11d # imm = 0x22222222 +; AVX512BW-SLOW-NEXT: kmovd %r11d, %k2 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm4, %zmm1 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdi), %xmm21 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm5, %zmm11 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm5, %zmm2, %zmm5 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm3, %zmm5 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] ; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vpermw %zmm6, %zmm14, %zmm4 {%k2} +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm12, %ymm6 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm4, %zmm6 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm11, %zmm2, %zmm11 +; AVX512BW-SLOW-NEXT: vpermw %zmm12, %zmm3, %zmm11 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm12[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm18, %ymm18 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm12, %ymm20, %ymm12 +; AVX512BW-SLOW-NEXT: vmovdqa64 16(%rdx), %xmm23 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm12, %zmm12 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm17[0],xmm23[1],xmm17[1],xmm23[2],xmm17[2],xmm23[3],xmm17[3],xmm23[4],xmm17[4],xmm23[5],xmm17[5],xmm23[6],xmm17[6],xmm23[7],xmm17[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm18, %zmm18 +; AVX512BW-SLOW-NEXT: vpermw %zmm18, %zmm4, %zmm12 {%k2} +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %xmm20 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %xmm22 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %xmm18 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm13, %zmm15 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm14, %zmm14, %zmm13 +; AVX512BW-SLOW-NEXT: vpermw %zmm13, %zmm2, %zmm13 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm3, %zmm13 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm14[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm15, %ymm15 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm16, %ymm14 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm17[8],xmm23[9],xmm17[9],xmm23[10],xmm17[10],xmm23[11],xmm17[11],xmm23[12],xmm17[12],xmm23[13],xmm17[13],xmm23[14],xmm17[14],xmm23[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm4, %zmm14 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm15, %zmm15, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm15, %zmm2, %zmm15 +; AVX512BW-SLOW-NEXT: vpermw %zmm16, %zmm3, %zmm15 {%k1} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm16[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm19 = xmm16[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm19, %ymm17, %ymm17 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm19, %ymm16 +; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %xmm19 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm17, %zmm17, %zmm17 +; AVX512BW-SLOW-NEXT: vpermw %zmm17, %zmm4, %zmm16 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r10), %xmm17 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rax), %xmm9 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm7, %zmm10 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm7, %zmm2, %zmm7 +; AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm3, %zmm7 {%k1} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm20 = xmm8[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm10, %ymm10 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm8, %ymm20, %ymm8 +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%r9), %xmm20 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%r8), %xmm10 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm18, %zmm18 +; AVX512BW-SLOW-NEXT: vpermw %zmm18, %zmm4, %zmm8 {%k2} +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm9[0],xmm17[0],xmm9[1],xmm17[1],xmm9[2],xmm17[2],xmm9[3],xmm17[3],xmm9[4],xmm17[4],xmm9[5],xmm17[5],xmm9[6],xmm17[6],xmm9[7],xmm17[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm18, %zmm18, %zmm18 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm10[0],xmm20[0],xmm10[1],xmm20[1],xmm10[2],xmm20[2],xmm10[3],xmm20[3],xmm10[4],xmm20[4],xmm10[5],xmm20[5],xmm10[6],xmm20[6],xmm10[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm19, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm19, %zmm2, %zmm19 +; AVX512BW-SLOW-NEXT: vpermw %zmm18, %zmm3, %zmm19 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rsi), %xmm18 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm17[8],xmm9[9],xmm17[9],xmm9[10],xmm17[10],xmm9[11],xmm17[11],xmm9[12],xmm17[12],xmm9[13],xmm17[13],xmm9[14],xmm17[14],xmm9[15],xmm17[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rdi), %xmm17 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm20[8],xmm10[9],xmm20[9],xmm10[10],xmm20[10],xmm10[11],xmm20[11],xmm10[12],xmm20[12],xmm10[13],xmm20[13],xmm10[14],xmm20[14],xmm10[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vmovdqa64 48(%rcx), %xmm20 +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm2, %zmm2 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3],xmm17[4],xmm18[4],xmm17[5],xmm18[5],xmm17[6],xmm18[6],xmm17[7],xmm18[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm9, %zmm3, %zmm2 {%k1} +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512BW-SLOW-NEXT: vmovdqa 48(%rdx), %xmm10 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm20[0],xmm10[1],xmm20[1],xmm10[2],xmm20[2],xmm10[3],xmm20[3],xmm10[4],xmm20[4],xmm10[5],xmm20[5],xmm10[6],xmm20[6],xmm10[7],xmm20[7] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vpermw %zmm9, %zmm4, %zmm3 {%k2} +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm17[8],xmm18[8],xmm17[9],xmm18[9],xmm17[10],xmm18[10],xmm17[11],xmm18[11],xmm17[12],xmm18[12],xmm17[13],xmm18[13],xmm17[14],xmm18[14],xmm17[15],xmm18[15] +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm9[2,3,2,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = xmm9[3,3,3,3] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm17, %ymm17 +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,1,1] +; AVX512BW-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm9, %ymm18, %ymm9 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 +; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm20[8],xmm10[9],xmm20[9],xmm10[10],xmm20[10],xmm10[11],xmm20[11],xmm10[12],xmm20[12],xmm10[13],xmm20[13],xmm10[14],xmm20[14],xmm10[15],xmm20[15] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm10, %zmm10 +; AVX512BW-SLOW-NEXT: vpermw %zmm10, %zmm4, %zmm9 {%k2} ; AVX512BW-SLOW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm17, %zmm18 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm25, %zmm26 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm21, %zmm22 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm24, %zmm27 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm5, %zmm6 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm13, %zmm14 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm15, %zmm16 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm19, %zmm3 {%k1} +; AVX512BW-SLOW-NEXT: vmovdqa32 %zmm2, %zmm9 {%k1} ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm27, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 448(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 448(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 256(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-SLOW-NEXT: vzeroupper ; AVX512BW-SLOW-NEXT: retq ; @@ -6784,234 +6613,234 @@ ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%r10), %xmm8 ; AVX512BW-FAST-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r10), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r10), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm1 -; AVX512BW-FAST-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rax), %xmm19 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rax), %xmm20 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-FAST-NEXT: vmovdqa 32(%r10), %xmm9 +; AVX512BW-FAST-NEXT: vmovdqa 48(%r10), %xmm0 +; AVX512BW-FAST-NEXT: vmovdqa (%rax), %xmm11 +; AVX512BW-FAST-NEXT: vmovdqa 16(%rax), %xmm13 +; AVX512BW-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512BW-FAST-NEXT: vmovdqa 48(%rax), %xmm1 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7,0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm3, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r9), %xmm21 -; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%r8), %xmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm5, %ymm5 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm5, %zmm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7,0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm7, %zmm16 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm3, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %xmm17 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm5 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7,0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15] +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm5, %zmm16 ; AVX512BW-FAST-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm6, %zmm16 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rcx), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15,8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rsi), %xmm25 -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm26 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm14, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm28 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm8, %ymm8 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm4, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm18 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm21 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %xmm22 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %xmm23 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm7 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15,8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm15, %zmm15 ; AVX512BW-FAST-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-FAST-NEXT: kmovd %eax, %k2 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm15, %zmm4 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 16(%r9), %xmm15 ; AVX512BW-FAST-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %eax, %k3 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm8 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm20[0],xmm17[0],xmm20[1],xmm17[1],xmm20[2],xmm17[2],xmm20[3],xmm17[3],xmm20[4],xmm17[4],xmm20[5],xmm17[5],xmm20[6],xmm17[6],xmm20[7],xmm17[7] +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm4 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] +; AVX512BW-FAST-NEXT: vmovdqa 16(%rsi), %xmm14 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm20 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm8, %zmm17 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm8, %ymm8, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm21 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm8, %ymm8 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm8 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm8 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm11, %zmm8 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm11, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm11, %zmm17 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm20[0],xmm14[0],xmm20[1],xmm14[1],xmm20[2],xmm14[2],xmm20[3],xmm14[3],xmm20[4],xmm14[4],xmm20[5],xmm14[5],xmm20[6],xmm14[6],xmm20[7],xmm14[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm18 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm21, %ymm18 +; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm22 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm11 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm11 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm22[0],xmm16[0],xmm22[1],xmm16[1],xmm22[2],xmm16[2],xmm22[3],xmm16[3],xmm22[4],xmm16[4],xmm22[5],xmm16[5],xmm22[6],xmm16[6],xmm22[7],xmm16[7] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm11 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm18 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm17, %zmm11 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm17 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm22[8],xmm16[8],xmm22[9],xmm16[9],xmm22[10],xmm16[10],xmm22[11],xmm16[11],xmm22[12],xmm16[12],xmm22[13],xmm16[13],xmm22[14],xmm16[14],xmm22[15],xmm16[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm12, %zmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm20[8],xmm14[8],xmm20[9],xmm14[9],xmm20[10],xmm14[10],xmm20[11],xmm14[11],xmm20[12],xmm14[12],xmm20[13],xmm14[13],xmm20[14],xmm14[14],xmm20[15],xmm14[15] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm12, %xmm20 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm12, %ymm12 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm12 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm13, %zmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm14, %zmm16 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm13, %zmm16 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm22 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm13 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm14, %ymm14, %ymm14 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm14, %zmm14 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa 48(%r9), %xmm14 +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm16, %zmm13 {%k3} +; AVX512BW-FAST-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FAST-NEXT: vmovdqa 48(%rcx), %xmm10 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15] +; AVX512BW-FAST-NEXT: vmovdqa 48(%rsi), %xmm9 ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm16, %zmm16 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm16, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm16, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm30 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm16, %ymm16 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm30[0],xmm23[0],xmm30[1],xmm23[1],xmm30[2],xmm23[2],xmm30[3],xmm23[3],xmm30[4],xmm23[4],xmm30[5],xmm23[5],xmm30[6],xmm23[6],xmm30[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm16 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm27 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm16 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %xmm29 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm20[8],xmm17[8],xmm20[9],xmm17[9],xmm20[10],xmm17[10],xmm20[11],xmm17[11],xmm20[12],xmm17[12],xmm20[13],xmm17[13],xmm20[14],xmm17[14],xmm20[15],xmm17[15] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm16 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm23[8],xmm30[9],xmm23[9],xmm30[10],xmm23[10],xmm30[11],xmm23[11],xmm30[12],xmm23[12],xmm30[13],xmm23[13],xmm30[14],xmm23[14],xmm30[15],xmm23[15] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm19, %ymm19 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm21 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm15, %ymm15 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm15 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm17, %zmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm17, %xmm25 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm17 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm22, %zmm17 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm20, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm22, %zmm22 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm20, %zmm22 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm21[0],xmm28[1],xmm21[1],xmm28[2],xmm21[2],xmm28[3],xmm21[3],xmm28[4],xmm21[4],xmm28[5],xmm21[5],xmm28[6],xmm21[6],xmm28[7],xmm21[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm20, %xmm23 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm30 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm20, %ymm20 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm20, %zmm23, %zmm20 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm30[0],xmm24[0],xmm30[1],xmm24[1],xmm30[2],xmm24[2],xmm30[3],xmm24[3],xmm30[4],xmm24[4],xmm30[5],xmm24[5],xmm30[6],xmm24[6],xmm30[7],xmm24[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm23, %zmm23 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm23, %zmm20 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r9), %xmm23 -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm22, %zmm20 {%k3} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%r8), %xmm26 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rcx), %xmm22 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm25, %zmm25 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm25, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm24[8],xmm30[9],xmm24[9],xmm30[10],xmm24[10],xmm30[11],xmm24[11],xmm30[12],xmm24[12],xmm30[13],xmm24[13],xmm30[14],xmm24[14],xmm30[15],xmm24[15] +; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm17, %zmm17 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm14[0],xmm20[1],xmm14[1],xmm20[2],xmm14[2],xmm20[3],xmm14[3],xmm20[4],xmm14[4],xmm20[5],xmm14[5],xmm20[6],xmm14[6],xmm20[7],xmm14[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm18 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm18, %zmm24 -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm21[8],xmm28[9],xmm21[9],xmm28[10],xmm21[10],xmm28[11],xmm21[11],xmm28[12],xmm21[12],xmm28[13],xmm21[13],xmm28[14],xmm21[14],xmm28[15],xmm21[15] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm18, %xmm28 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm28, %ymm18, %ymm18 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm24, %zmm18 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3],xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm21, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm26[0],xmm23[0],xmm26[1],xmm23[1],xmm26[2],xmm23[2],xmm26[3],xmm23[3],xmm26[4],xmm23[4],xmm26[5],xmm23[5],xmm26[6],xmm23[6],xmm26[7],xmm23[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm24, %zmm24 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm21, %zmm24 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm19[0],xmm25[1],xmm19[1],xmm25[2],xmm19[2],xmm25[3],xmm19[3],xmm25[4],xmm19[4],xmm25[5],xmm19[5],xmm25[6],xmm19[6],xmm25[7],xmm19[7] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm21, %xmm27 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm28 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm28, %ymm27 -; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdx), %xmm28 -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm21, %ymm21 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm28[0],xmm22[0],xmm28[1],xmm22[1],xmm28[2],xmm22[2],xmm28[3],xmm22[3],xmm28[4],xmm22[4],xmm28[5],xmm22[5],xmm28[6],xmm22[6],xmm28[7],xmm22[7] -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm27, %ymm27 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm27, %zmm27 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm27, %zmm21 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm24, %zmm21 {%k3} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm23[8],xmm26[9],xmm23[9],xmm26[10],xmm23[10],xmm26[11],xmm23[11],xmm26[12],xmm23[12],xmm26[13],xmm23[13],xmm26[14],xmm23[14],xmm26[15],xmm23[15] -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm15, %zmm15 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm15 {%k1} -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm28[8],xmm22[8],xmm28[9],xmm22[9],xmm28[10],xmm22[10],xmm28[11],xmm22[11],xmm28[12],xmm22[12],xmm28[13],xmm22[13],xmm28[14],xmm22[14],xmm28[15],xmm22[15] -; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm19[8],xmm25[9],xmm19[9],xmm25[10],xmm19[10],xmm25[11],xmm19[11],xmm25[12],xmm19[12],xmm25[13],xmm19[13],xmm25[14],xmm19[14],xmm25[15],xmm19[15] -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm22 -; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm22, %ymm23, %ymm22 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm18, %zmm18 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm17, %zmm18 {%k1} +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm16[0],xmm9[0],xmm16[1],xmm9[1],xmm16[2],xmm9[2],xmm16[3],xmm9[3],xmm16[4],xmm9[4],xmm16[5],xmm9[5],xmm16[6],xmm9[6],xmm16[7],xmm9[7] +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm17, %xmm19 +; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm21, %ymm19 +; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdx), %xmm21 +; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm17, %ymm17 +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm19, %zmm17 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm10[0],xmm21[1],xmm10[1],xmm21[2],xmm10[2],xmm21[3],xmm10[3],xmm21[4],xmm10[4],xmm21[5],xmm10[5],xmm21[6],xmm10[6],xmm21[7],xmm10[7] ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm19, %ymm19 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm22, %zmm19 -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm12 -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm12, %zmm12 -; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm19 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa32 %zmm15, %zmm19 {%k3} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm19, %zmm19 +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm19, %zmm19 +; AVX512BW-FAST-NEXT: vmovdqu16 %zmm19, %zmm17 {%k2} +; AVX512BW-FAST-NEXT: vmovdqa32 %zmm18, %zmm17 {%k3} +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpshufb %zmm2, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm20[8],xmm14[8],xmm20[9],xmm14[9],xmm20[10],xmm14[10],xmm20[11],xmm14[11],xmm20[12],xmm14[12],xmm20[13],xmm14[13],xmm20[14],xmm14[14],xmm20[15],xmm14[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb %zmm5, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm21[8],xmm10[8],xmm21[9],xmm10[9],xmm21[10],xmm10[10],xmm21[11],xmm10[11],xmm21[12],xmm10[12],xmm21[13],xmm10[13],xmm21[14],xmm10[14],xmm21[15],xmm10[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm16[8],xmm9[8],xmm16[9],xmm9[9],xmm16[10],xmm9[10],xmm16[11],xmm9[11],xmm16[12],xmm9[12],xmm16[13],xmm9[13],xmm16[14],xmm9[14],xmm16[15],xmm9[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512BW-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm19, 192(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm18, 320(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm2, 448(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 @@ -7034,7 +6863,9 @@ ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} ; AVX1: {{.*}} +; AVX2: {{.*}} ; AVX512BW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -161,14 +161,15 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -209,13 +210,13 @@ ; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $96, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq @@ -232,24 +233,24 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm2 +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rdi), %xmm2 -; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rsi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rsi), %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rsi), %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -304,13 +305,13 @@ ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $160, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq @@ -330,14 +331,15 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rdi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 32(%rdi), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -387,13 +389,13 @@ ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, (%rbx) -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps %xmm0, 48(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 32(%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps %xmm0, 48(%rbx) +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: addq $104, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,9 +107,11 @@ ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -117,9 +119,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> @@ -180,8 +181,8 @@ ; ; AVX512-LABEL: fmul_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1] -; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512-NEXT: vmulpd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -534,11 +534,20 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retq ; -; AVX512-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq +; AVX512F-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: cmpne_knownzeros_zext_v8i16_v8i32: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQBW-NEXT: retq %a = lshr <8 x i16> %x, %b = icmp ne <8 x i16> %a, zeroinitializer %c = zext <8 x i1> %b to <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -818,24 +818,23 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestnmq %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestnmq %xmm2, %xmm0, %k1 +; BITALG-NEXT: vptestmq %xmm0, %xmm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp eq <2 x i64> %2, @@ -955,26 +954,25 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddq %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestmq %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmq %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddq %xmm3, %xmm0, %xmm4 -; BITALG-NEXT: vpand %xmm4, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogq $222, %xmm3, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestmq %xmm2, %xmm0, %k0 +; BITALG-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) %3 = icmp ne <2 x i64> %2, @@ -1034,24 +1032,23 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestnmd %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpandn %xmm0, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestnmd %xmm2, %xmm0, %k1 +; BITALG-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp eq <4 x i32> %2, @@ -1118,26 +1115,25 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG_NOVLX-NEXT: vpaddd %xmm3, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; BITALG-NEXT: vpaddd %xmm3, %xmm0, %xmm4 -; BITALG-NEXT: vpand %xmm4, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpternlogd $222, %xmm3, %xmm2, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; BITALG-NEXT: vptestmd %xmm2, %xmm0, %k0 +; BITALG-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) %3 = icmp ne <4 x i32> %2, diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -10,18 +10,18 @@ define <32 x i8> @ugt_1_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ugt_1_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v32i8: @@ -76,13 +76,13 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { ; AVX1-LABEL: ult_2_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1003,18 +1003,18 @@ define <16 x i16> @ugt_1_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ugt_1_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v16i16: @@ -1069,13 +1069,13 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { ; AVX1-LABEL: ult_2_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -3302,18 +3302,18 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ugt_1_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v8i32: @@ -3370,13 +3370,13 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) { ; AVX1-LABEL: ult_2_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -9406,18 +9406,18 @@ define <4 x i64> @ugt_1_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ugt_1_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ugt_1_v4i64: @@ -9474,13 +9474,13 @@ define <4 x i64> @ult_2_v4i64(<4 x i64> %0) { ; AVX1-LABEL: ult_2_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -450,17 +450,25 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpackssdw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v4i64: @@ -468,10 +476,18 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: eq_1_v4i64: @@ -480,7 +496,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomneqq %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomneqq %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddq %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -488,8 +504,14 @@ ; XOP-NEXT: vpaddq %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomeqq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vandps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpand %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpslld $31, %xmm0, %xmm0 +; XOP-NEXT: vpsrad $31, %xmm0, %xmm0 +; XOP-NEXT: vpmovsxdq %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxdq %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: eq_1_v4i64: @@ -509,24 +531,22 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestnmq %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestnmq %ymm2, %ymm0, %k1 +; BITALG-NEXT: vptestmq %ymm0, %ymm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp eq <4 x i64> %2, @@ -541,7 +561,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -551,20 +571,33 @@ ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_1_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: ne_1_v4i64: @@ -573,7 +606,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomeqq %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomeqq %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddq %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -581,8 +614,14 @@ ; XOP-NEXT: vpaddq %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomneqq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vorps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpslld $31, %xmm0, %xmm0 +; XOP-NEXT: vpsrad $31, %xmm0, %xmm0 +; XOP-NEXT: vpmovsxdq %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxdq %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ne_1_v4i64: @@ -605,25 +644,24 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddq %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestmq %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmq %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddq %ymm3, %ymm0, %ymm4 -; BITALG-NEXT: vpand %ymm4, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogq $222, %ymm3, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestmq %ymm2, %ymm0, %k0 +; BITALG-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %0) %3 = icmp ne <4 x i64> %2, @@ -637,17 +675,23 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpackssdw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v8i32: @@ -655,10 +699,16 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: eq_1_v8i32: @@ -667,7 +717,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomneqd %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -675,8 +725,12 @@ ; XOP-NEXT: vpaddd %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomeqd %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vandps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpand %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpmovsxwd %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxwd %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: eq_1_v8i32: @@ -696,24 +750,22 @@ ; ; BITALG_NOVLX-LABEL: eq_1_v8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestnmd %zmm1, %zmm0, %k1 +; BITALG_NOVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: eq_1_v8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; BITALG-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestnmd %ymm2, %ymm0, %k1 +; BITALG-NEXT: vptestmd %ymm0, %ymm0, %k1 {%k1} +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp eq <8 x i32> %2, @@ -728,7 +780,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -738,20 +790,29 @@ ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_1_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: ne_1_v8i32: @@ -760,7 +821,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomeqd %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomeqd %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -768,8 +829,12 @@ ; XOP-NEXT: vpaddd %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vorps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpmovsxwd %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxwd %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ne_1_v8i32: @@ -792,25 +857,24 @@ ; ; BITALG_NOVLX-LABEL: ne_1_v8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG_NOVLX-NEXT: vpaddd %ymm3, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vpor %ymm0, %ymm2, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vptestmd %zmm1, %zmm0, %k0 +; BITALG_NOVLX-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; BITALG_NOVLX-NEXT: korw %k0, %k1, %k1 +; BITALG_NOVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ne_1_v8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; BITALG-NEXT: vpaddd %ymm3, %ymm0, %ymm4 -; BITALG-NEXT: vpand %ymm4, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpternlogd $222, %ymm3, %ymm2, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm2 +; BITALG-NEXT: vptestmd %ymm2, %ymm0, %k0 +; BITALG-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; BITALG-NEXT: korw %k0, %k1, %k1 +; BITALG-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} ; BITALG-NEXT: retq %2 = tail call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %0) %3 = icmp ne <8 x i32> %2, @@ -824,17 +888,23 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpacksswb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: eq_1_v16i16: @@ -842,10 +912,16 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2 ; AVX2-NEXT: vpaddw %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: eq_1_v16i16: @@ -854,7 +930,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomneqw %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomneqw %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -862,8 +938,12 @@ ; XOP-NEXT: vpaddw %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomeqw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vandps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpand %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpmovsxbw %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxbw %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: eq_1_v16i16: @@ -913,7 +993,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -923,20 +1003,29 @@ ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ne_1_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpaddw %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: ne_1_v16i16: @@ -945,7 +1034,7 @@ ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOP-NEXT: vpcomeqw %xmm2, %xmm1, %xmm3 ; XOP-NEXT: vpcomeqw %xmm2, %xmm0, %xmm4 -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; XOP-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 ; XOP-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm5 ; XOP-NEXT: vpand %xmm5, %xmm1, %xmm1 @@ -953,8 +1042,12 @@ ; XOP-NEXT: vpaddw %xmm4, %xmm0, %xmm4 ; XOP-NEXT: vpand %xmm4, %xmm0, %xmm0 ; XOP-NEXT: vpcomneqw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; XOP-NEXT: vorps %ymm0, %ymm3, %ymm0 +; XOP-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; XOP-NEXT: vpmovsxbw %xmm0, %xmm1 +; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; XOP-NEXT: vpmovsxbw %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOP-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ne_1_v16i16: diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -100,34 +100,23 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v4i64_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v4i64_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v4i64_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <4 x i64> %a0, %2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %1) ret i64 %2 @@ -185,7 +174,7 @@ ; AVX2-NEXT: vpsrlq $60, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -195,9 +184,12 @@ ; AVX512-LABEL: test_v8i64_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlq $60, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -288,7 +280,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -417,7 +409,8 @@ ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpsrld $31, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -496,8 +489,9 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -507,34 +501,27 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v8i32_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v8i32_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v8i32_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <8 x i32> %a0, %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) ret i32 %2 @@ -617,9 +604,9 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -628,11 +615,15 @@ ; ; AVX512-LABEL: test_v16i32_v16i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -754,9 +745,9 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -961,15 +952,47 @@ ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i16_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = and <8 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 @@ -1006,62 +1029,67 @@ ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; AVX1-SLOW-LABEL: test_v16i16_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-SLOW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: vzeroupper +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: vzeroupper +; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v16i16_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v16i16_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: test_v16i16_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = and <16 x i16> %a0, %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) ret i16 %2 @@ -1145,13 +1173,16 @@ ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1322,3 +1353,5 @@ declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>) declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>) declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -84,7 +84,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -95,7 +95,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -174,7 +174,7 @@ ; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -187,7 +187,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -320,7 +320,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -337,7 +337,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -441,7 +441,8 @@ ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -517,8 +518,9 @@ ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -527,9 +529,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -540,9 +542,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -639,9 +641,9 @@ ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -654,9 +656,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -810,9 +812,9 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -828,9 +830,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1026,8 +1028,10 @@ ; AVX1-FAST-LABEL: test_v8i16_v8i8: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1120,9 +1124,11 @@ ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1132,11 +1138,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1148,11 +1154,11 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1263,11 +1269,11 @@ ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1281,11 +1287,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1445,11 +1451,11 @@ ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1466,11 +1472,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1560,7 +1566,8 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -1641,8 +1648,10 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -1773,11 +1782,15 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1788,11 +1801,15 @@ ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1888,13 +1905,17 @@ ; AVX512-NEXT: vpmovb2m %zmm0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -81,7 +81,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -92,7 +92,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -111,12 +111,32 @@ ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i64_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i64_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i64_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i64> %2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %1) ret i64 %2 @@ -220,13 +240,22 @@ ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: retq ; -; AVX1-LABEL: test_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: retq +; AVX1-SLOW-LABEL: test_v4i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v4i32: ; AVX2: # %bb.0: @@ -257,12 +286,56 @@ ; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i32_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i32_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i32_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i32_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i32> %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) ret i32 %2 @@ -278,14 +351,38 @@ ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i32_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i32_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i32_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <16 x i8> %a0 to <16 x i32> %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) ret i32 %2 @@ -492,13 +589,41 @@ ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v8i16_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v8i16_v8i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v8i16_v8i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v8i16_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i16_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: retq %1 = zext <8 x i8> %a0 to <8 x i16> %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1) ret i16 %2 @@ -515,15 +640,68 @@ ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX-LABEL: test_v16i16_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq +; AVX1-SLOW-LABEL: test_v16i16_v16i8: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovd %xmm0, %eax +; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: test_v16i16_v16i8: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-FAST-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovd %xmm0, %eax +; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: test_v16i16_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = zext <16 x i8> %a0 to <16 x i16> %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) ret i16 %2 @@ -571,12 +749,17 @@ ; ; AVX512-LABEL: test_v32i16_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -58,7 +58,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -68,7 +68,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -106,7 +106,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -118,7 +118,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -168,7 +168,7 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -181,7 +181,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -254,7 +254,8 @@ ; ; AVX1-FAST-LABEL: test_v4i32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: retq @@ -306,8 +307,9 @@ ; AVX1-FAST-LABEL: test_v8i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: vzeroupper @@ -316,9 +318,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -328,9 +330,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -386,9 +388,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -400,9 +402,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -476,9 +478,9 @@ ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -491,9 +493,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -633,8 +635,10 @@ ; ; AVX1-FAST-LABEL: test_v8i16: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -700,9 +704,11 @@ ; AVX1-FAST-LABEL: test_v16i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax @@ -712,11 +718,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -727,11 +733,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -798,11 +804,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -815,11 +821,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -904,11 +910,11 @@ ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -922,11 +928,11 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1201,11 +1207,15 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1214,11 +1224,15 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1261,11 +1275,15 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1274,13 +1292,17 @@ ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1292,34 +1314,34 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: -; SSE-NEXT: paddb %xmm7, %xmm3 -; SSE-NEXT: paddb %xmm5, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm6, %xmm2 ; SSE-NEXT: paddb %xmm4, %xmm0 ; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddb %xmm7, %xmm3 +; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: psadbw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: psadbw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1335,11 +1357,15 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1349,13 +1375,17 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -12,43 +12,49 @@ ; define i1 @trunc_v2i64_v2i1(<2 x i64>) { -; SSE2-LABEL: trunc_v2i64_v2i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: cmpl $3, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v2i64_v2i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_v2i1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: cmpl $3, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v2i64_v2i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vtestpd %xmm1, %xmm0 ; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_v2i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v2i64_v2i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_v2i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $3, %al +; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) @@ -56,43 +62,49 @@ } define i1 @trunc_v4i32_v4i1(<4 x i32>) { -; SSE2-LABEL: trunc_v4i32_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v4i32_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v4i32_v4i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vtestps %xmm1, %xmm0 ; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i32_v4i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i32_v4i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i32_v4i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) @@ -100,25 +112,22 @@ } define i1 @trunc_v8i16_v8i1(<8 x i16>) { -; SSE2-LABEL: trunc_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v8i16_v8i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i16_v8i1: @@ -145,88 +154,85 @@ } define i1 @trunc_v16i8_v16i1(<16 x i8>) { -; SSE2-LABEL: trunc_v16i8_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v16i8_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq -; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setb %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setb %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setb %al -; AVX512VL-NEXT: retq +; SSE-LABEL: trunc_v16i8_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: sete %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b } define i1 @trunc_v4i64_v4i1(<4 x i64>) { -; SSE2-LABEL: trunc_v4i64_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: cmpl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v4i64_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_v4i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestpd %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b @@ -235,31 +241,47 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) { ; SSE2-LABEL: trunc_v8i32_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: packsswb %xmm0, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v8i32_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -277,35 +299,39 @@ } define i1 @trunc_v16i16_v16i1(<16 x i16>) { -; SSE2-LABEL: trunc_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -322,42 +348,41 @@ } define i1 @trunc_v32i8_v32i1(<32 x i8>) { -; SSE2-LABEL: trunc_v32i8_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i8_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = trunc <32 x i8> %0 to <32 x i1> @@ -388,26 +413,43 @@ ; ; SSE41-LABEL: trunc_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -429,38 +471,66 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) { ; SSE2-LABEL: trunc_v16i32_v16i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -477,41 +547,50 @@ } define i1 @trunc_v32i16_v32i1(<32 x i16>) { -; SSE2-LABEL: trunc_v32i16_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i16_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: psllw $7, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -589,10 +668,11 @@ ; ; AVX512F-LABEL: trunc_v64i8_v64i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %eax +; AVX512F-NEXT: cmpl $-1, %eax ; AVX512F-NEXT: sete %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -630,10 +710,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v2i64_v2i1: ; SSE41: # %bb.0: @@ -641,11 +723,39 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -657,7 +767,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -667,37 +777,84 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v4i32_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp0_v8i16_v8i1(<8 x i16>) { -; SSE2-LABEL: icmp0_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq +; SSE-LABEL: icmp0_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: packsswb %xmm1, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: icmp0_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp0_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp0_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp0_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp0_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptest %xmm0, %xmm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest %xmm0, %xmm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -709,7 +866,7 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -732,49 +889,82 @@ define i1 @icmp0_v4i64_v4i1(<4 x i64>) { ; SSE2-LABEL: icmp0_v4i64_v4i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v4i64_v4i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp0_v4i64_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp0_v4i64_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest %ymm0, %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp0_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp0_v8i32_v8i1(<8 x i32>) { -; SSE2-LABEL: icmp0_v8i32_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp0_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp0_v8i32_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX-LABEL: icmp0_v8i32_v8i1: ; AVX: # %bb.0: @@ -788,29 +978,60 @@ } define i1 @icmp0_v16i16_v16i1(<16 x i16>) { -; SSE2-LABEL: icmp0_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq +; SSE-LABEL: icmp0_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1-LABEL: icmp0_v16i16_v16i1: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; SSE41-LABEL: icmp0_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX2-LABEL: icmp0_v16i16_v16i1: +; AVX2: # %bb.0: +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; -; AVX-LABEL: icmp0_v16i16_v16i1: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX512F-LABEL: icmp0_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptest %ymm0, %ymm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptest %ymm0, %ymm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -819,11 +1040,11 @@ define i1 @icmp0_v32i8_v32i1(<32 x i8>) { ; SSE2-LABEL: icmp0_v32i8_v32i1: ; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -848,38 +1069,71 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) { ; SSE2-LABEL: icmp0_v8i64_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp0_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: sete %al +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: sete %al +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -896,39 +1150,47 @@ } define i1 @icmp0_v16i32_v16i1(<16 x i32>) { -; SSE2-LABEL: icmp0_v16i32_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp0_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp0_v16i32_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp0_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -946,50 +1208,81 @@ } define i1 @icmp0_v32i16_v32i1(<32 x i16>) { -; SSE2-LABEL: icmp0_v32i16_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp0_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp0_v32i16_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: pcmpeqw %xmm4, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm4, %xmm3 +; SSE-NEXT: pcmpeqw %xmm4, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp0_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp0_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp0_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -998,48 +1291,75 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) { ; SSE2-LABEL: icmp0_v64i8_v64i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v64i8_v64i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp0_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp0_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp0_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp0_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp0_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp0_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, zeroinitializer %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -1049,60 +1369,60 @@ ; SSE2-LABEL: icmp0_v8i1: ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psllw $15, %xmm0 -; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: psllw $15, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp0_v8i1: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: psllw $15, %xmm0 -; SSE41-NEXT: psraw $15, %xmm0 -; SSE41-NEXT: pmovmskb %xmm0, %eax -; SSE41-NEXT: testl %eax, %eax +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm1 +; SSE41-NEXT: psllw $15, %xmm1 +; SSE41-NEXT: packsswb %xmm1, %xmm1 +; SSE41-NEXT: pmovmskb %xmm1, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: icmp0_v8i1: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax -; AVX1OR2-NEXT: testl %eax, %eax +; AVX1OR2-NEXT: cmpb $-1, %al ; AVX1OR2-NEXT: sete %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: icmp0_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb %al, %al +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: testq %rax, %rax ; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp0_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: testb %al, %al +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: testq %rax, %rax ; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: icmp0_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: testq %rax, %rax ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> @@ -1121,8 +1441,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movmskpd %xmm0, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1133,12 +1455,43 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp1_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp1_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vptest %xmm1, %xmm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp1_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -1150,7 +1503,7 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1161,40 +1514,90 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp1_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp1_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vptest %xmm1, %xmm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp1_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v4i32_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp1_v8i16_v8i1(<8 x i16>) { -; SSE2-LABEL: icmp1_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq +; SSE-LABEL: icmp1_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE-NEXT: packsswb %xmm1, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: icmp1_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp1_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp1_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp1_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vptest %xmm1, %xmm0 -; AVX-NEXT: setb %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp1_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vptest %xmm1, %xmm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -1206,7 +1609,7 @@ ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1231,28 +1634,39 @@ define i1 @icmp1_v4i64_v4i1(<4 x i64>) { ; SSE2-LABEL: icmp1_v4i64_v4i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp1_v4i64_v4i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1264,43 +1678,64 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp1_v8i32_v8i1(<8 x i32>) { -; SSE2-LABEL: icmp1_v8i32_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp1_v8i32_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1313,42 +1748,39 @@ ; AVX2-NEXT: retq ; ; AVX512-LABEL: icmp1_v8i32_v8i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %a = icmp eq <8 x i32> %0, - %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) - ret i1 %b -} - -define i1 @icmp1_v16i16_v16i1(<16 x i16>) { -; SSE2-LABEL: icmp1_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; AVX512: # %bb.0: +; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: setb %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %a = icmp eq <8 x i32> %0, + %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) + ret i1 %b +} + +define i1 @icmp1_v16i16_v16i1(<16 x i16>) { +; SSE-LABEL: icmp1_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1360,13 +1792,31 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512BW-NEXT: vptest %ymm1, %ymm0 +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vptest %ymm1, %ymm0 +; AVX512VL-NEXT: setb %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -1375,11 +1825,11 @@ define i1 @icmp1_v32i8_v32i1(<32 x i8>) { ; SSE2-LABEL: icmp1_v32i8_v32i1: ; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1393,10 +1843,13 @@ ; ; AVX1-LABEL: icmp1_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1423,41 +1876,68 @@ define i1 @icmp1_v8i64_v8i1(<8 x i64>) { ; SSE2-LABEL: icmp1_v8i64_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm5, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp1_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm3, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm2, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1476,44 +1956,48 @@ } define i1 @icmp1_v16i32_v16i1(<16 x i32>) { -; SSE2-LABEL: icmp1_v16i32_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp1_v16i32_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1531,55 +2015,83 @@ } define i1 @icmp1_v32i16_v32i1(<32 x i16>) { -; SSE2-LABEL: icmp1_v32i16_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp1_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp1_v32i16_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE-NEXT: pcmpeqw %xmm4, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm4, %xmm3 +; SSE-NEXT: pcmpeqw %xmm4, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp1_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -1588,53 +2100,81 @@ define i1 @icmp1_v64i8_v64i1(<64 x i8>) { ; SSE2-LABEL: icmp1_v64i8_v64i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp1_v64i8_v64i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp1_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp1_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vptest %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp1_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp1_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp1_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp1_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -1673,31 +2213,28 @@ ; ; AVX512F-LABEL: icmp1_v8i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: cmpb $-1, %al +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512F-NEXT: cmpq %rcx, %rax ; AVX512F-NEXT: sete %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp1_v8i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: cmpb $-1, %al +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, %rax +; AVX512BW-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512BW-NEXT: cmpq %rcx, %rax ; AVX512BW-NEXT: sete %al -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: icmp1_v8i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovb2m %xmm0, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: cmpb $-1, %al +; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; AVX512VL-NEXT: cmpq %rcx, %rax ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq %a = trunc <8 x i8> %0 to <8 x i1> @@ -1715,8 +2252,10 @@ ; SSE2-LABEL: icmp_v2i64_v2i1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax +; SSE2-NEXT: cmpl $3, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1727,12 +2266,42 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp_v2i64_v2i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp_v2i64_v2i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp_v2i64_v2i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v2i64_v2i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v2i64_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $3, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, %1 %b = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a) ret i1 %b @@ -1743,7 +2312,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1754,39 +2323,86 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: icmp_v4i32_v4i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX1OR2-LABEL: icmp_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vptest %xmm0, %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq +; +; AVX512F-LABEL: icmp_v4i32_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v4i32_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v4i32_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, %1 %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } - -define i1 @icmp_v8i16_v8i1(<8 x i16>, <8 x i16>) { -; SSE2-LABEL: icmp_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq + +define i1 @icmp_v8i16_v8i1(<8 x i16>, <8 x i16>) { +; SSE-LABEL: icmp_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1OR2-LABEL: icmp_v8i16_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: cmpb $-1, %al +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; -; SSE41-LABEL: icmp_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; AVX512F-LABEL: icmp_v8i16_v8i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: retq ; -; AVX-LABEL: icmp_v8i16_v8i1: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vptest %xmm0, %xmm0 -; AVX-NEXT: sete %al -; AVX-NEXT: retq +; AVX512BW-LABEL: icmp_v8i16_v8i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vptest %xmm0, %xmm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v8i16_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vptest %xmm0, %xmm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: retq %a = icmp eq <8 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %a) ret i1 %b @@ -1797,7 +2413,7 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; @@ -1823,26 +2439,35 @@ ; SSE2-LABEL: icmp_v4i64_v4i1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: cmpl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v4i64_v4i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: cmpl $15, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1855,42 +2480,61 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: cmpb $15, %al +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <4 x i64> %0, %1 %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) ret i1 %b } define i1 @icmp_v8i32_v8i1(<8 x i32>, <8 x i32>) { -; SSE2-LABEL: icmp_v8i32_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v8i32_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp_v8i32_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm0, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpb $-1, %al +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1916,29 +2560,25 @@ } define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) { -; SSE2-LABEL: icmp_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm3, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1951,13 +2591,30 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v16i16_v16i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vptest %ymm0, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v16i16_v16i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v16i16_v16i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vptest %ymm0, %ymm0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v16i16_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vptest %ymm0, %ymm0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <16 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %a) ret i1 %b @@ -1966,27 +2623,31 @@ define i1 @icmp_v32i8_v32i1(<32 x i8>, <32 x i8>) { ; SSE2-LABEL: icmp_v32i8_v32i1: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 ; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v32i8_v32i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vptest %xmm0, %xmm0 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2015,47 +2676,68 @@ ; SSE2-LABEL: icmp_v8i64_v8i1: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 +; SSE2-NEXT: packsswb %xmm1, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: icmp_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: pcmpeqq %xmm7, %xmm3 +; SSE41-NEXT: pcmpeqq %xmm6, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: icmp_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 -; AVX1-NEXT: sete %al +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 +; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 -; AVX2-NEXT: sete %al +; AVX2-NEXT: vpcmpeqq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 +; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -2072,49 +2754,46 @@ } define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) { -; SSE2-LABEL: icmp_v16i32_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: xorl $15, %eax -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v16i32_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp_v16i32_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2132,120 +2811,156 @@ } define i1 @icmp_v32i16_v32i1(<32 x i16>, <32 x i16>) { -; SSE2-LABEL: icmp_v32i16_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp_v32i16_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqw %xmm5, %xmm1 +; SSE-NEXT: pcmpeqw %xmm4, %xmm0 +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: pcmpeqw %xmm7, %xmm3 +; SSE-NEXT: pcmpeqw %xmm6, %xmm2 +; SSE-NEXT: packsswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <32 x i16> %0, %1 %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b } define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) { -; SSE2-LABEL: icmp_v64i8_v64i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqb %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqb %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: sete %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: icmp_v64i8_v64i1: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: icmp_v64i8_v64i1: +; SSE: # %bb.0: +; SSE-NEXT: pcmpeqb %xmm5, %xmm1 +; SSE-NEXT: pcmpeqb %xmm7, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: pcmpeqb %xmm6, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: icmp_v64i8_v64i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: icmp_v64i8_v64i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: icmp_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: icmp_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vptest %ymm0, %ymm0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: icmp_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: icmp_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = icmp eq <64 x i8> %0, %1 %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b @@ -2257,5 +2972,3 @@ declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>) declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>) declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; SSE: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -60,26 +60,37 @@ ; ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setae %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax +; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %a0) @@ -113,27 +124,39 @@ ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -145,31 +168,31 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE2-LABEL: test_v16i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; @@ -178,10 +201,13 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: cmpq $-1, %rax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -190,18 +216,27 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: cmpq $-1, %rax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: cmpq $-1, %rax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -282,26 +317,43 @@ ; ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a0) @@ -335,27 +387,45 @@ ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -367,31 +437,31 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; @@ -400,10 +470,15 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpl $-1, %eax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -412,18 +487,31 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpl $-1, %eax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -523,26 +611,49 @@ ; ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setae %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax +; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a0) @@ -576,27 +687,51 @@ ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -608,31 +743,31 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v64i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; @@ -641,10 +776,17 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpw $-1, %ax +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -653,18 +795,35 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpw $-1, %ax +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpw $-1, %ax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -783,26 +942,55 @@ ; ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a0) @@ -836,27 +1024,57 @@ ; AVX1-LABEL: test_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -868,31 +1086,31 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm1 -; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: pand %xmm7, %xmm3 +; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm1 ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; @@ -901,10 +1119,19 @@ ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vptest %ymm1, %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: cmpb $-1, %al +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -913,18 +1140,39 @@ ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: cmpb $-1, %al +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: cmpb $-1, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -40,9 +40,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -50,7 +50,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -60,7 +60,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -85,7 +85,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -96,7 +96,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -108,7 +108,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -139,7 +139,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -152,7 +152,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -165,7 +165,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -233,11 +233,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -245,7 +245,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -257,7 +257,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -286,7 +286,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -299,7 +299,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -313,9 +313,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -348,7 +348,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -363,7 +363,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,9 +378,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -487,11 +487,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -502,11 +502,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -517,11 +517,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -553,11 +553,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -569,11 +569,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -628,11 +628,11 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -646,11 +646,11 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -815,11 +815,11 @@ ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -832,7 +832,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -849,7 +849,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -890,7 +890,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -908,7 +908,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -927,13 +927,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -974,7 +974,7 @@ ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -994,7 +994,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1014,13 +1014,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -90,7 +90,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq @@ -143,7 +144,7 @@ ; AVX1-SLOW-LABEL: test_v8f32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -155,9 +156,11 @@ ; AVX1-FAST-LABEL: test_v8f32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -165,7 +168,7 @@ ; AVX2-LABEL: test_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -177,7 +180,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -221,7 +224,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -234,7 +237,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 @@ -246,7 +249,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -260,7 +263,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 ; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -346,7 +349,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32_zero: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -394,7 +398,7 @@ ; AVX1-SLOW-LABEL: test_v8f32_zero: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -405,16 +409,18 @@ ; AVX1-FAST-LABEL: test_v8f32_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f32_zero: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -425,7 +431,7 @@ ; AVX512-LABEL: test_v8f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -466,7 +472,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -478,7 +484,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 @@ -489,7 +495,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -502,7 +508,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -587,7 +593,8 @@ ; ; AVX1-FAST-LABEL: test_v4f32_undef: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -635,7 +642,7 @@ ; AVX1-SLOW-LABEL: test_v8f32_undef: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -646,16 +653,18 @@ ; AVX1-FAST-LABEL: test_v8f32_undef: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: test_v8f32_undef: ; AVX2: # %bb.0: ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -666,7 +675,7 @@ ; AVX512-LABEL: test_v8f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -707,7 +716,7 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -719,7 +728,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 @@ -730,7 +739,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -743,7 +752,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -820,8 +829,9 @@ ; AVX1-FAST-LABEL: test_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -1037,8 +1047,9 @@ ; AVX1-FAST-LABEL: test_v4f64_zero: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; @@ -1241,8 +1252,9 @@ ; AVX1-FAST-LABEL: test_v4f64_undef: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll @@ -107,7 +107,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -118,7 +118,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -159,7 +159,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -172,7 +172,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -112,7 +112,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -123,7 +123,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -164,7 +164,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -177,7 +177,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmaxps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -153,7 +153,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -164,7 +164,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -205,7 +205,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -218,7 +218,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vminps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -113,7 +113,7 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -125,7 +125,7 @@ ; AVX512-LABEL: test_v8f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -169,7 +169,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -183,7 +183,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2 ; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] @@ -292,7 +292,7 @@ ; AVX-LABEL: test_v8f32_zero: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -303,7 +303,7 @@ ; AVX512-LABEL: test_v8f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -344,7 +344,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -357,7 +357,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -465,7 +465,7 @@ ; AVX-LABEL: test_v8f32_undef: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -476,7 +476,7 @@ ; AVX512-LABEL: test_v8f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -517,7 +517,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -530,7 +530,7 @@ ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -225,7 +225,7 @@ ; AVX512DQVL-LABEL: test_v4i64: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -437,7 +437,7 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -757,7 +757,7 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax @@ -869,9 +869,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -881,9 +881,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -945,9 +945,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1047,9 +1047,9 @@ ; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1062,9 +1062,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1186,11 +1186,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1201,11 +1201,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1255,11 +1255,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1272,11 +1272,11 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1289,11 +1289,11 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax @@ -1306,11 +1306,11 @@ ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax @@ -1323,11 +1323,11 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax @@ -1389,11 +1389,11 @@ ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1407,11 +1407,11 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -1425,11 +1425,11 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax @@ -1445,11 +1445,11 @@ ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax @@ -1465,11 +1465,11 @@ ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax @@ -1646,13 +1646,13 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmullw %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1668,19 +1668,19 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pmullw %xmm2, %xmm3 +; SSE41-NEXT: pmullw %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1688,17 +1688,18 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1707,39 +1708,127 @@ ; ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: test_v32i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: # kill: def $al killed $al killed $eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: test_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, %eax +; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; AVX512DQ-LABEL: test_v32i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovd %xmm0, %eax +; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512DQVL-LABEL: test_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vmovd %xmm0, %eax +; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq %1 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1748,24 +1837,24 @@ ; SSE2-LABEL: test_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm4, %xmm1 -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm4, %xmm6 +; SSE2-NEXT: pmullw %xmm5, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm2, %xmm0 ; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmullw %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1781,27 +1870,27 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE41-NEXT: pmullw %xmm4, %xmm5 +; SSE41-NEXT: pmullw %xmm3, %xmm6 +; SSE41-NEXT: pmullw %xmm5, %xmm6 +; SSE41-NEXT: pmullw %xmm0, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE41-NEXT: pmullw %xmm6, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; @@ -1809,26 +1898,27 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -1838,19 +1928,31 @@ ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -1859,18 +1961,33 @@ ; ; AVX512BW-LABEL: test_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -1879,18 +1996,39 @@ ; ; AVX512BWVL-LABEL: test_v64i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -1901,19 +2039,31 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -1924,19 +2074,31 @@ ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax @@ -1949,142 +2111,143 @@ define i8 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm10 +; SSE2-NEXT: pmullw %xmm9, %xmm10 ; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm1, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm8, %xmm3 -; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm7, %xmm1 -; SSE2-NEXT: pmullw %xmm5, %xmm1 -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pmullw %xmm11, %xmm8 +; SSE2-NEXT: pmullw %xmm9, %xmm8 +; SSE2-NEXT: pmullw %xmm10, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm3, %xmm0 ; SSE2-NEXT: pmullw %xmm4, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: pmullw %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm6, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm4, %xmm0 +; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm8, %xmm7 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm8, %xmm3 ; SSE41-NEXT: pmullw %xmm7, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm1 ; SSE41-NEXT: pmullw %xmm5, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm6 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pmullw %xmm8, %xmm9 +; SSE41-NEXT: pmullw %xmm6, %xmm10 +; SSE41-NEXT: pmullw %xmm9, %xmm10 +; SSE41-NEXT: pmullw %xmm2, %xmm4 +; SSE41-NEXT: pmullw %xmm7, %xmm11 +; SSE41-NEXT: pmullw %xmm4, %xmm11 +; SSE41-NEXT: pmullw %xmm10, %xmm11 +; SSE41-NEXT: pmullw %xmm1, %xmm11 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,3,2,3] +; SSE41-NEXT: pmullw %xmm11, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pmullw %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9 +; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrld $8, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -2093,28 +2256,40 @@ ; ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax @@ -2123,22 +2298,36 @@ ; ; AVX512BW-LABEL: test_v128i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpackuswb %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -2147,22 +2336,42 @@ ; ; AVX512BWVL-LABEL: test_v128i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm1, %zmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 +; AVX512BWVL-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; AVX512BWVL-NEXT: vpackuswb %zmm0, %zmm1, %zmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax @@ -2172,29 +2381,41 @@ ; AVX512DQ-LABEL: test_v128i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vmovd %xmm0, %eax ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax @@ -2204,29 +2425,41 @@ ; AVX512DQVL-LABEL: test_v128i8: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm5, %ymm6, %ymm5 +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm5, %ymm3 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQVL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -12,42 +12,47 @@ ; define i1 @trunc_v2i64_v2i1(<2 x i64>) { -; SSE2-LABEL: trunc_v2i64_v2i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v2i64_v2i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_v2i1: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v2i64_v2i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestpd %xmm0, %xmm0 ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_v2i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $3, %al ; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v2i64_v2i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512BW-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $3, %al ; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v2i64_v2i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> @@ -56,42 +61,47 @@ } define i1 @trunc_v4i32_v4i1(<4 x i32>) { -; SSE2-LABEL: trunc_v4i32_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v4i32_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v4i32_v4i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1OR2-NEXT: vtestps %xmm0, %xmm0 ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i32_v4i1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al ; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: trunc_v4i32_v4i1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al ; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: trunc_v4i32_v4i1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> @@ -100,23 +110,19 @@ } define i1 @trunc_v8i16_v8i1(<8 x i16>) { -; SSE2-LABEL: trunc_v8i16_v8i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v8i16_v8i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_v8i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: trunc_v8i16_v8i1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax +; AVX1OR2-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX1OR2-NEXT: setne %al ; AVX1OR2-NEXT: retq ; @@ -144,88 +150,83 @@ } define i1 @trunc_v16i8_v16i1(<16 x i8>) { -; SSE2-LABEL: trunc_v16i8_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v16i8_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq -; -; AVX1OR2-LABEL: trunc_v16i8_v16i1: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1OR2-NEXT: setne %al -; AVX1OR2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i8_v16i1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512F-NEXT: setne %al -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i8_v16i1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX512BW-NEXT: setne %al -; AVX512BW-NEXT: retq +; SSE-LABEL: trunc_v16i8_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; AVX512VL-LABEL: trunc_v16i8_v16i1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512VL-NEXT: vptest %xmm1, %xmm0 -; AVX512VL-NEXT: setne %al -; AVX512VL-NEXT: retq +; AVX-LABEL: trunc_v16i8_v16i1: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: testl %eax, %eax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %a = trunc <16 x i8> %0 to <16 x i1> %b = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a) ret i1 %b } define i1 @trunc_v4i64_v4i1(<4 x i64>) { -; SSE2-LABEL: trunc_v4i64_v4i1: -; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v4i64_v4i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_v4i1: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v4i64_v4i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_v4i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v4i64_v4i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v4i64_v4i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: testb $15, %al +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v4i64_v4i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testb $15, %al +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v4i64_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: testb %al, %al +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <4 x i64> %0 to <4 x i1> %b = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %a) ret i1 %b @@ -234,31 +235,43 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) { ; SSE2-LABEL: trunc_v8i32_v8i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: psllw $15, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v8i32_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i32_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i32_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -276,33 +289,38 @@ } define i1 @trunc_v16i16_v16i1(<16 x i16>) { -; SSE2-LABEL: trunc_v16i16_v16i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v16i16_v16i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_v16i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v16i16_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -320,41 +338,40 @@ } define i1 @trunc_v32i8_v32i1(<32 x i8>) { -; SSE2-LABEL: trunc_v32i8_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i8_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i8_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_v32i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -385,26 +402,40 @@ ; ; SSE41-LABEL: trunc_v8i64_v8i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_v8i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vtestps %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -425,37 +456,65 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) { ; SSE2-LABEL: trunc_v16i32_v16i1: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: movmskps %xmm0, %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_v16i1: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -473,39 +532,49 @@ } define i1 @trunc_v32i16_v32i1(<32 x i16>) { -; SSE2-LABEL: trunc_v32i16_v32i1: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_v32i1: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i16_v32i1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_v32i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -560,13 +629,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: setne %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v64i8_v64i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm0, %eax +; AVX512F-NEXT: testl %eax, %eax +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: setne %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> %b = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> %a) ret i1 %b @@ -710,11 +798,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX512F-NEXT: setne %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp0_v8i16_v8i1: @@ -973,9 +1058,8 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1275,9 +1359,8 @@ ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512F-NEXT: kortestw %k0, %k0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper @@ -1515,11 +1598,8 @@ ; AVX512F-LABEL: icmp_v8i16_v8i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AVX512F-NEXT: setne %al -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: icmp_v8i16_v8i1: @@ -1773,9 +1853,8 @@ ; AVX512F-LABEL: icmp_v16i16_v16i1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] +; AVX512F-NEXT: vptest %ymm1, %ymm0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2070,9 +2149,8 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 ; AVX512F-NEXT: kortestw %k0, %k0 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: vzeroupper @@ -2182,5 +2260,3 @@ declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>) declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>) declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -55,12 +55,41 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v4i64: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v4i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v4i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp ne i64 %1, 0 ret i1 %2 @@ -91,7 +120,12 @@ ; AVX1-LABEL: test_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -99,15 +133,26 @@ ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -119,30 +164,30 @@ define i1 @test_v16i64(<16 x i64> %a0) { ; SSE2-LABEL: test_v16i64: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -151,7 +196,12 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -161,7 +211,12 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -169,8 +224,14 @@ ; AVX512-LABEL: test_v16i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: testq %rax, %rax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -246,12 +307,47 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp eq i32 %1, 0 ret i1 %2 @@ -282,7 +378,14 @@ ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -290,15 +393,30 @@ ; AVX2-LABEL: test_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -310,30 +428,30 @@ define i1 @test_v32i32(<32 x i32> %a0) { ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: movmskps %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -342,7 +460,14 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -352,7 +477,14 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -360,8 +492,16 @@ ; AVX512-LABEL: test_v32i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -456,12 +596,53 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = icmp ne i16 %1, 0 ret i1 %2 @@ -492,7 +673,16 @@ ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -500,15 +690,34 @@ ; AVX2-LABEL: test_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -520,30 +729,30 @@ define i1 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v64i16: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -552,7 +761,16 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testw %ax, %ax ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -562,7 +780,16 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testw %ax, %ax ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -570,8 +797,18 @@ ; AVX512-LABEL: test_v64i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testw %ax, %ax ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -685,12 +922,59 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX-LABEL: test_v32i8: -; AVX: # %bb.0: -; AVX-NEXT: vptest %ymm0, %ymm0 -; AVX-NEXT: sete %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: test_v32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al +; AVX1-NEXT: sete %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: sete %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a0) %2 = icmp eq i8 %1, 0 ret i1 %2 @@ -721,7 +1005,18 @@ ; AVX1-LABEL: test_v64i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -729,15 +1024,38 @@ ; AVX2-LABEL: test_v64i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -749,30 +1067,30 @@ define i1 @test_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest %xmm0, %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -781,7 +1099,18 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -791,7 +1120,18 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -799,8 +1139,20 @@ ; AVX512-LABEL: test_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -877,23 +1229,42 @@ ; ; AVX1-LABEL: mask_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: mask_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: mask_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl $-2147483648, %eax # imm = 0x80000000 ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -904,30 +1275,68 @@ } define i1 @trunc_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: trunc_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax -; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: setne %al -; SSE2-NEXT: retq +; SSE-LABEL: trunc_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: testb %al, %al +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; SSE41-LABEL: trunc_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; AVX1-LABEL: trunc_v16i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb %al, %al +; AVX1-NEXT: setne %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq ; -; AVX-LABEL: trunc_v16i16: -; AVX: # %bb.0: -; AVX-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX-NEXT: setne %al -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-LABEL: trunc_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb %al, %al +; AVX2-NEXT: setne %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: trunc_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb %al, %al +; AVX512-NEXT: setne %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a0) %2 = trunc i16 %1 to i8 %3 = icmp ne i8 %2, 0 @@ -937,29 +1346,29 @@ define i1 @mask_v128i8(<128 x i8> %a0) { ; SSE2-LABEL: mask_v128i8: ; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psllw $7, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; ; SSE41-LABEL: mask_v128i8: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 ; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: por %xmm7, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 +; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -968,7 +1377,18 @@ ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -978,8 +1398,18 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -987,8 +1417,20 @@ ; AVX512-LABEL: mask_v128i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testb $1, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -40,9 +40,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -50,7 +50,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -60,7 +60,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -85,7 +85,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -96,7 +96,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -108,7 +108,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -139,7 +139,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -152,7 +152,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -165,7 +165,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -233,11 +233,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -245,7 +245,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -257,7 +257,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -286,7 +286,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -299,7 +299,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -313,9 +313,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -348,7 +348,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -363,7 +363,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,9 +378,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -487,11 +487,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -502,11 +502,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -517,11 +517,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -553,11 +553,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -569,11 +569,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -628,11 +628,11 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -646,11 +646,11 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -815,11 +815,11 @@ ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -832,7 +832,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -849,7 +849,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -890,7 +890,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -908,7 +908,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -927,13 +927,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -974,7 +974,7 @@ ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -994,7 +994,7 @@ ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1014,13 +1014,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -180,8 +180,8 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -203,7 +203,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -372,8 +372,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -398,7 +398,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -697,8 +697,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -725,7 +725,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -871,9 +871,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -883,9 +883,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -973,9 +973,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1081,9 +1081,9 @@ ; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1096,9 +1096,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1269,38 +1269,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1353,42 +1347,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1416,15 +1402,15 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxsw %xmm7, %xmm3 -; SSE4-NEXT: pmaxsw %xmm5, %xmm1 -; SSE4-NEXT: pmaxsw %xmm3, %xmm1 ; SSE4-NEXT: pmaxsw %xmm6, %xmm2 ; SSE4-NEXT: pmaxsw %xmm4, %xmm0 ; SSE4-NEXT: pmaxsw %xmm2, %xmm0 -; SSE4-NEXT: pmaxsw %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxsw %xmm7, %xmm3 +; SSE4-NEXT: pmaxsw %xmm5, %xmm1 +; SSE4-NEXT: pmaxsw %xmm3, %xmm1 +; SSE4-NEXT: pmaxsw %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1432,17 +1418,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxsw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1457,44 +1443,35 @@ ; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1818,44 +1795,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb $127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1944,48 +1913,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb $127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2060,18 +2019,18 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxsb %xmm7, %xmm3 -; SSE4-NEXT: pmaxsb %xmm5, %xmm1 -; SSE4-NEXT: pmaxsb %xmm3, %xmm1 ; SSE4-NEXT: pmaxsb %xmm6, %xmm2 ; SSE4-NEXT: pmaxsb %xmm4, %xmm0 ; SSE4-NEXT: pmaxsb %xmm2, %xmm0 -; SSE4-NEXT: pmaxsb %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxsb %xmm7, %xmm3 +; SSE4-NEXT: pmaxsb %xmm5, %xmm1 +; SSE4-NEXT: pmaxsb %xmm3, %xmm1 +; SSE4-NEXT: pmaxsb %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorb $127, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2079,17 +2038,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2106,50 +2065,39 @@ ; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorb $127, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorb $127, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorb $127, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -180,8 +180,8 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -203,7 +203,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -372,8 +372,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -398,7 +398,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -697,8 +697,8 @@ ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -725,7 +725,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -871,9 +871,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -883,9 +883,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -959,9 +959,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -973,9 +973,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1081,9 +1081,9 @@ ; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1096,9 +1096,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1269,38 +1269,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1353,42 +1347,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1416,15 +1402,15 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pminsw %xmm7, %xmm3 -; SSE4-NEXT: pminsw %xmm5, %xmm1 -; SSE4-NEXT: pminsw %xmm3, %xmm1 ; SSE4-NEXT: pminsw %xmm6, %xmm2 ; SSE4-NEXT: pminsw %xmm4, %xmm0 ; SSE4-NEXT: pminsw %xmm2, %xmm0 -; SSE4-NEXT: pminsw %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminsw %xmm7, %xmm3 +; SSE4-NEXT: pminsw %xmm5, %xmm1 +; SSE4-NEXT: pminsw %xmm3, %xmm1 +; SSE4-NEXT: pminsw %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1432,17 +1418,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminsw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminsw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminsw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1457,44 +1443,35 @@ ; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1818,44 +1795,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -1944,48 +1913,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2060,18 +2019,18 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pminsb %xmm7, %xmm3 -; SSE4-NEXT: pminsb %xmm5, %xmm1 -; SSE4-NEXT: pminsb %xmm3, %xmm1 ; SSE4-NEXT: pminsb %xmm6, %xmm2 ; SSE4-NEXT: pminsb %xmm4, %xmm0 ; SSE4-NEXT: pminsb %xmm2, %xmm0 -; SSE4-NEXT: pminsb %xmm1, %xmm0 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pminsb %xmm7, %xmm3 +; SSE4-NEXT: pminsb %xmm5, %xmm1 +; SSE4-NEXT: pminsb %xmm3, %xmm1 +; SSE4-NEXT: pminsb %xmm0, %xmm1 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: addb $-128, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2079,17 +2038,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminsb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminsb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2106,50 +2065,39 @@ ; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: addb $-128, %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: addb $-128, %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: addb $-128, %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -211,10 +211,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -238,7 +238,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -431,10 +431,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -461,7 +461,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -809,10 +809,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 @@ -841,7 +841,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1005,9 +1005,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1017,9 +1017,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1108,9 +1108,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1122,9 +1122,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1257,9 +1257,9 @@ ; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1272,9 +1272,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1481,39 +1481,32 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %a0) ret i16 %1 } @@ -1574,43 +1567,34 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %a0) ret i16 %1 } @@ -1648,16 +1632,16 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxuw %xmm7, %xmm3 -; SSE4-NEXT: pmaxuw %xmm5, %xmm1 -; SSE4-NEXT: pmaxuw %xmm3, %xmm1 ; SSE4-NEXT: pmaxuw %xmm6, %xmm2 ; SSE4-NEXT: pmaxuw %xmm4, %xmm0 ; SSE4-NEXT: pmaxuw %xmm2, %xmm0 -; SSE4-NEXT: pmaxuw %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxuw %xmm7, %xmm3 +; SSE4-NEXT: pmaxuw %xmm5, %xmm1 +; SSE4-NEXT: pmaxuw %xmm3, %xmm1 +; SSE4-NEXT: pmaxuw %xmm0, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notl %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1665,17 +1649,17 @@ ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxuw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxuw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1691,45 +1675,35 @@ ; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %a0) ret i16 %1 } @@ -1964,45 +1938,36 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %a0) ret i8 %1 } @@ -2065,49 +2030,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %a0) ret i8 %1 } @@ -2138,19 +2092,19 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pmaxub %xmm7, %xmm3 -; SSE4-NEXT: pmaxub %xmm5, %xmm1 -; SSE4-NEXT: pmaxub %xmm3, %xmm1 ; SSE4-NEXT: pmaxub %xmm6, %xmm2 ; SSE4-NEXT: pmaxub %xmm4, %xmm0 ; SSE4-NEXT: pmaxub %xmm2, %xmm0 -; SSE4-NEXT: pmaxub %xmm1, %xmm0 -; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxub %xmm7, %xmm3 +; SSE4-NEXT: pmaxub %xmm5, %xmm1 +; SSE4-NEXT: pmaxub %xmm3, %xmm1 +; SSE4-NEXT: pmaxub %xmm0, %xmm1 +; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notb %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2158,17 +2112,17 @@ ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpmaxub %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxub %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2186,51 +2140,39 @@ ; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notb %al ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v128i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %a0) ret i8 %1 } diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -212,10 +212,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -239,7 +239,7 @@ ; AVX512VL-LABEL: test_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -434,10 +434,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -464,7 +464,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -813,10 +813,10 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 @@ -845,7 +845,7 @@ ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1009,9 +1009,9 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1021,9 +1021,9 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1112,9 +1112,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1126,9 +1126,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1261,9 +1261,9 @@ ; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -1276,9 +1276,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -1470,8 +1470,13 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1480,8 +1485,13 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1546,8 +1556,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1556,10 +1571,15 @@ ; AVX512-LABEL: test_v32i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1611,31 +1631,31 @@ ; ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: -; SSE4-NEXT: pminuw %xmm7, %xmm3 -; SSE4-NEXT: pminuw %xmm5, %xmm1 -; SSE4-NEXT: pminuw %xmm3, %xmm1 ; SSE4-NEXT: pminuw %xmm6, %xmm2 ; SSE4-NEXT: pminuw %xmm4, %xmm0 ; SSE4-NEXT: pminuw %xmm2, %xmm0 -; SSE4-NEXT: pminuw %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminuw %xmm7, %xmm3 +; SSE4-NEXT: pminuw %xmm5, %xmm1 +; SSE4-NEXT: pminuw %xmm3, %xmm1 +; SSE4-NEXT: pminuw %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminuw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminuw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -1648,8 +1668,13 @@ ; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -1659,10 +1684,15 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -1874,10 +1904,15 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1886,10 +1921,15 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -1950,10 +1990,15 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -1962,12 +2007,17 @@ ; AVX512-LABEL: test_v64i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper @@ -2002,34 +2052,34 @@ ; ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: -; SSE4-NEXT: pminub %xmm7, %xmm3 -; SSE4-NEXT: pminub %xmm5, %xmm1 -; SSE4-NEXT: pminub %xmm3, %xmm1 ; SSE4-NEXT: pminub %xmm6, %xmm2 ; SSE4-NEXT: pminub %xmm4, %xmm0 ; SSE4-NEXT: pminub %xmm2, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm7, %xmm3 +; SSE4-NEXT: pminub %xmm5, %xmm1 +; SSE4-NEXT: pminub %xmm3, %xmm1 ; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $al killed $al killed $eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpminub %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpminub %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -2044,10 +2094,15 @@ ; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $al killed $al killed $eax ; AVX2-NEXT: vzeroupper @@ -2057,12 +2112,17 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -723,10 +723,14 @@ ; ; AVX1-LABEL: trunc_v32i16_v32i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: xorb %ah, %al diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -40,9 +40,9 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -50,7 +50,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -60,7 +60,7 @@ ; AVX512-LABEL: test_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -85,7 +85,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -96,7 +96,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -108,7 +108,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -139,7 +139,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -152,7 +152,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -165,7 +165,7 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax @@ -233,11 +233,11 @@ ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -245,7 +245,7 @@ ; AVX2-LABEL: test_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -257,7 +257,7 @@ ; AVX512-LABEL: test_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -286,7 +286,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -299,7 +299,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -313,9 +313,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -348,7 +348,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -363,7 +363,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -378,9 +378,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -487,11 +487,11 @@ ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -502,11 +502,11 @@ ; AVX2-LABEL: test_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -517,11 +517,11 @@ ; AVX512-LABEL: test_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -553,11 +553,11 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -569,11 +569,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -628,11 +628,11 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vxorps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -646,11 +646,11 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -815,11 +815,11 @@ ; AVX1-LABEL: test_v32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -832,7 +832,7 @@ ; AVX2-LABEL: test_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -849,7 +849,7 @@ ; AVX512-LABEL: test_v32i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -890,7 +890,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -908,7 +908,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -927,13 +927,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -974,7 +974,7 @@ ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -994,7 +994,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1014,13 +1014,13 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -267,10 +267,10 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -278,14 +278,14 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -294,10 +294,10 @@ ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -305,56 +305,56 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -373,11 +373,11 @@ ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -395,23 +395,23 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -422,11 +422,11 @@ ; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 @@ -444,23 +444,23 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -476,28 +476,28 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -511,28 +511,28 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 @@ -667,22 +667,17 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -691,21 +686,16 @@ ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,6,6,6,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k2 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -740,25 +730,20 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -767,50 +752,59 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <48 x i32> @@ -822,62 +816,52 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k2 +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k3 +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k1 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -887,480 +871,80 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k0 -; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $3, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 -; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $20, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <96 x i32> @@ -1372,1023 +956,229 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor3_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm8 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm10 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm10 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm12, %zmm12, %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor3_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm8 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm10 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm8 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm10 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor3_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 -; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovq %k3, %k5 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $3, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $4, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 -; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $63, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $42, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $38, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $32, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $35, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $26, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $31, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $22, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrq $25, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $16, %k0, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $20, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $10, %k0, %k3 -; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $14, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $15, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $7, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $8, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrq $9, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor3_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21,37,37,38,38,38,39,39,39,40,40,40,41,41,41,42,42,58,59,59,59,60,60,60,61,61,61,62,62,62,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31,32,32,32,33,33,33,34,34,34,35,35,35,36,36,36,37,53,53,54,54,54,55,55,55,56,56,56,57,57,57,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor3_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [42,43,43,43,44,44,44,45,45,45,46,46,46,47,47,47,48,48,48,49,49,49,50,50,50,51,51,51,52,52,52,53,53,53,54,54,54,55,55,55,56,56,56,57,57,57,58,58,58,59,59,59,60,60,60,61,61,61,62,62,62,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,22,22,22,23,23,23,24,24,24,25,25,25,26,26,26,27,27,27,28,28,28,29,29,29,30,30,30,31,31,31,32,32,32,33,33,33,34,34,34,35,35,35,36,36,36,37,37,37,38,38,38,39,39,39,40,40,40,41,41,41,42,42] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,9,10,10,10,11,11,11,12,12,12,13,13,13,14,14,14,15,15,15,16,16,16,17,17,17,18,18,18,19,19,19,20,20,20,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <192 x i32> %data = call <192 x i32> @llvm.masked.load.v192i32.p0(ptr %in.vec, i32 64, <192 x i1> %tgt.mask, <192 x i32> poison) @@ -2613,26 +1403,26 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -2640,68 +1430,68 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -2718,17 +1508,17 @@ ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} @@ -2740,23 +1530,23 @@ ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -2765,17 +1555,17 @@ ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm3 ; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 @@ -2787,23 +1577,23 @@ ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 ; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2818,28 +1608,28 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -2853,28 +1643,28 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -3065,50 +1855,50 @@ ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; @@ -3128,50 +1918,50 @@ ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 @@ -3311,15 +2101,10 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k2 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 @@ -3327,12 +2112,12 @@ ; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -3340,27 +2125,22 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 128(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3375,12 +2155,12 @@ ; AVX512BW-ONLY-NEXT: kmovq %rax, %k1 ; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq @@ -3396,12 +2176,12 @@ ; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1 ; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq @@ -3419,35 +2199,30 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k5 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -3456,70 +2231,87 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k5 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k1, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512BW-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermd %zmm4, %zmm5, %zmm4 +; AVX512VBMI-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <80 x i32> @@ -3531,834 +2323,199 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm10, %zmm10, %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm10, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm10 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm10, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k0 -; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $3, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k3 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $25, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $26, %k0, %k7 -; AVX512BW-NEXT: kmovq %k0, %k4 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $27, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $22, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $20, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $18, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k7, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $11, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <160 x i32> @@ -4370,1545 +2527,353 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor5_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm13 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm12 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm20, %zmm20, %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm20, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm20, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm17 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor5_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k5 -; AVX512BW-NEXT: kshiftrq $1, %k5, %k0 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftrq $2, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k7 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k7, %k7 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k7, %k7 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $3, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $5, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $6, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $7, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $11, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $13, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $14, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $15, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $23, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $24, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $29, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $30, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $31, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $39, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $40, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $55, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor5_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,60,61,61,61,61,61,62,62,62,62,62,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,44,45,45,45,45,45,46,46,46,46,46,47,47,47,47,47,48,48,48,48,48,49,49,49,49,49,50,50,50,50,50,51] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31,32,32,32,32,32,33,33,33,33,33,34,34,34,34,34,35,51,51,51,51,52,52,52,52,52,53,53,53,53,53,54,54] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor5_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [51,51,51,51,52,52,52,52,52,53,53,53,53,53,54,54,54,54,54,55,55,55,55,55,56,56,56,56,56,57,57,57,57,57,58,58,58,58,58,59,59,59,59,59,60,60,60,60,60,61,61,61,61,61,62,62,62,62,62,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [38,38,38,39,39,39,39,39,40,40,40,40,40,41,41,41,41,41,42,42,42,42,42,43,43,43,43,43,44,44,44,44,44,45,45,45,45,45,46,46,46,46,46,47,47,47,47,47,48,48,48,48,48,49,49,49,49,49,50,50,50,50,50,51] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [25,25,26,26,26,26,26,27,27,27,27,27,28,28,28,28,28,29,29,29,29,29,30,30,30,30,30,31,31,31,31,31,32,32,32,32,32,33,33,33,33,33,34,34,34,34,34,35,35,35,35,35,36,36,36,36,36,37,37,37,37,37,38,38] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,13,13,13,13,13,14,14,14,14,14,15,15,15,15,15,16,16,16,16,16,17,17,17,17,17,18,18,18,18,18,19,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,22,22,22,23,23,23,23,23,24,24,24,24,24,25,25,25] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,8,8,8,8,8,9,9,9,9,9,10,10,10,10,10,11,11,11,11,11,12,12,12,12] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <320 x i32> %data = call <320 x i32> @llvm.masked.load.v320i32.p0(ptr %in.vec, i32 64, <320 x i1> %tgt.mask, <320 x i32> poison) @@ -6085,25 +3050,20 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -6112,50 +3072,59 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf8: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf8: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <8 x i32> %tgt.mask = shufflevector <8 x i1> %src.mask, <8 x i1> poison, <48 x i32> @@ -6169,40 +3138,35 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -6211,80 +3175,98 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm2 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf16: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm4 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpermw %zmm4, %zmm5, %zmm4 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm4, %zmm5, %zmm4 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <96 x i32> @@ -6296,986 +3278,227 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm12, %zmm12, %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm12, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm12 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm11 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k5 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k5, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $26, %k5, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, %k3 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $28, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 -; AVX512BW-NEXT: kmovq %k0, %k1 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $21, %k2, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 -; AVX512BW-NEXT: kmovq %k2, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $23, %k6, %k5 -; AVX512BW-NEXT: kmovq %k6, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 -; AVX512BW-NEXT: kmovq %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k3} {z} -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $12, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $9, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $7, %k1, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $4, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k7, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,42,42,43,43,43,43,43,43,44,44,44,44,44,44,45,45,61,61,61,61,62,62,62,62,62,62,63,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31,32,32,32,32,32,32,33,33,33,33,33,33,34,34,34,34,50,50,51,51,51,51,51,51,52,52,52,52,52,52,53,53] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,21,21,22,22,22,22,22,22,23,23,23,23,23,23,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,26,26,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,16,16,16,16,16,16,17,17,17,17,17,17,18,18,18,18,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <192 x i32> @@ -7287,1838 +3510,418 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor6_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm14 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm15 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm24, %zmm24, %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm24, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm16 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm15 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm15 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm24, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm23 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm22 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm21 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm20 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm20 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm17 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm16 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor6_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k5 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k3 -; AVX512BW-NEXT: kshiftrq $2, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k7 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k7, %k7 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k0, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $3, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $4, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $5, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} -; AVX512BW-NEXT: kandw %k4, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $7, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $9, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $10, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $11, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $12, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $13, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $14, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $15, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $19, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $20, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, %k4, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $26, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $27, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $28, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $29, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $30, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $35, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $39, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $40, %k3, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $43, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $51, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $54, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $58, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $61, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $62, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor6_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10,10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5] +; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512BW-ONLY-NEXT: vpshufb %zmm4, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb %zmm4, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k7 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k6, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k5} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [53,53,53,53,54,54,54,54,54,54,55,55,55,55,55,55,56,56,56,56,56,56,57,57,57,57,57,57,58,58,58,58,58,58,59,59,59,59,59,59,60,60,60,60,60,60,61,61,61,61,61,61,62,62,62,62,62,62,63,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [42,42,43,43,43,43,43,43,44,44,44,44,44,44,45,45,45,45,45,45,46,46,46,46,46,46,47,47,47,47,47,47,48,48,48,48,48,48,49,49,49,49,49,49,50,50,50,50,50,50,51,51,51,51,51,51,52,52,52,52,52,52,53,53] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,32,32,32,32,33,33,33,33,33,33,34,34,34,34,34,34,35,35,35,35,35,35,36,36,36,36,36,36,37,37,37,37,37,37,38,38,38,38,38,38,39,39,39,39,39,39,40,40,40,40,40,40,41,41,41,41,41,41,42,42,42,42] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [21,21,21,21,22,22,22,22,22,22,23,23,23,23,23,23,24,24,24,24,24,24,25,25,25,25,25,25,26,26,26,26,26,26,27,27,27,27,27,27,28,28,28,28,28,28,29,29,29,29,29,29,30,30,30,30,30,30,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15,16,16,16,16,16,16,17,17,17,17,17,17,18,18,18,18,18,18,19,19,19,19,19,19,20,20,20,20,20,20,21,21] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k7 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k6, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <384 x i32> %data = call <384 x i32> @llvm.masked.load.v384i32.p0(ptr %in.vec, i32 64, <384 x i1> %tgt.mask, <384 x i32> poison) @@ -9262,69 +4065,59 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-SLOW-LABEL: mask_replication_factor7_vf8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: kmovw (%rdi), %k1 -; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-SLOW-NEXT: kmovw (%rdi), %k2 +; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-SLOW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-SLOW-NEXT: movw $1, %ax -; AVX512F-SLOW-NEXT: kmovw %eax, %k2 -; AVX512F-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z} ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3] -; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} -; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z} -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx) +; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k2 +; AVX512F-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: mask_replication_factor7_vf8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: kmovw (%rdi), %k1 -; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-FAST-NEXT: kmovw (%rdi), %k2 +; AVX512F-FAST-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-FAST-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-FAST-NEXT: movw $1, %ax -; AVX512F-FAST-NEXT: kmovw %eax, %k2 -; AVX512F-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z} ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k1 -; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k1} {z} -; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} -; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k3} {z} -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512F-FAST-NEXT: vptestmd %ymm0, %ymm0, %k2 +; AVX512F-FAST-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z} +; AVX512F-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-FAST-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k3} {z} +; AVX512F-FAST-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -9332,32 +4125,27 @@ ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-SLOW-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-SLOW-NEXT: movw $1, %ax -; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 -; AVX512DQ-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,3,3,6,7,7,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,3,3] ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k4 -; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z} -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k4} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; @@ -9365,32 +4153,27 @@ ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-FAST-NEXT: vpmovm2d %k1, %zmm1 -; AVX512DQ-FAST-NEXT: movw $1, %ax -; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 -; AVX512DQ-FAST-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k3 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,7,7,7,7,7,7] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-FAST-NEXT: vpmovd2m %ymm0, %k4 -; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm1 {%k4} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k3} {z} -; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k2} {z} -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 192(%rdx) -; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k4} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-FAST-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq ; @@ -9408,11 +4191,11 @@ ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} ; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper @@ -9432,11 +4215,11 @@ ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z} ; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 192(%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper @@ -9455,45 +4238,40 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq @@ -9502,88 +4280,72 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: mask_replication_factor7_vf16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512BW-NEXT: vpermd %zmm0, %zmm3, %zmm3 -; AVX512BW-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512BW-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512BW-NEXT: vpermd %zmm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512BW-NEXT: vpermd %zmm0, %zmm6, %zmm6 -; AVX512BW-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512BW-NEXT: vpermd %zmm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rdx) +; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -9597,1129 +4359,263 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf32: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm3 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm14, %zmm14, %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm14, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm6 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm14 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm14, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm13 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm9 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm3 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm6 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm4 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor7_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k6 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kandw %k0, %k6, %k1 -; AVX512BW-NEXT: kmovq %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovq %k6, %k2 -; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $27, %k2, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k7, %k0, %k2 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $22, %k3, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k3, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k4, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k4, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k3 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512BW-NEXT: kmovq %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $15, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $11, %k3, %k6 -; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $10, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $6, %k4, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $4, %k6, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k3, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf32: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31] +; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf32: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovd (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31] +; AVX512VBMI-ONLY-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <224 x i32> @@ -10731,2108 +4627,481 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor7_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: movw $1, %ax -; AVX512F-ONLY-NEXT: kmovw %eax, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k1 -; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm5 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm3, %zmm16 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm5, %zmm17 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm7, %zmm18 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm9, %zmm19 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm11, %zmm20 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm13, %zmm21 +; AVX512F-ONLY-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm3, %zmm22 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm5, %zmm23 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm7, %zmm24 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm9, %zmm25 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm11, %zmm26 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm13, %zmm27 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm15, %zmm2 +; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm28, %zmm28, %zmm28 {%k1} {z} +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm3, %zmm3 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm5, %zmm5 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm7, %zmm7 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm9, %zmm9 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm11, %zmm11 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm13, %zmm13 +; AVX512F-ONLY-NEXT: vpermd %zmm28, %zmm15, %zmm15 +; AVX512F-ONLY-NEXT: vptestmd %zmm15, %zmm15, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm13, %zmm13, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm11, %zmm11, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm9, %zmm9, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm7, %zmm7, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm5, %zmm5, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm27, %zmm27, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm27 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm26, %zmm26, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm26 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm25, %zmm25, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm25 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm24, %zmm24, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm24 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm23, %zmm23, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm23 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm22, %zmm22, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm22 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm1 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm21, %zmm21, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm21 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm20, %zmm20, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm20 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm19, %zmm19, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm18, %zmm18, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm17, %zmm17, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm17 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm16, %zmm16, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm16 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm14, %zmm14, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm14 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm12, %zmm12, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm12 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm10, %zmm10, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm10 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm8, %zmm8, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm8 {%k1} {z} ; AVX512F-ONLY-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1152(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1088(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 1024(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k1 +; AVX512F-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 1664(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm8, 1600(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm10, 1536(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm12, 1472(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm14, 1408(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 1344(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm16, 1280(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm17, 1216(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm19, 1088(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm20, 1024(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm21, 960(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 896(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm22, 832(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm23, 768(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm24, 704(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm25, 640(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm26, 576(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm27, 512(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm13, 64(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 -; AVX512DQ-NEXT: movw $1, %ax -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512DQ-NEXT: kmovw 6(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm2, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm9, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm11, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm13, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm15, %zmm16 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm2, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm18 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm19 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm11, %zmm20 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm13, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm15, %zmm22 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm2, %zmm23 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm7, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm9, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm11, %zmm26 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm13, %zmm27 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm5, %zmm15, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm9, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm13, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm15, %zmm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm16 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm17 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm7, %zmm18 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm9, %zmm19 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm11, %zmm20 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm13, %zmm21 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm22 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm23 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm24 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm9, %zmm25 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm11, %zmm26 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm13, %zmm27 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm15, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm28 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpermd %zmm28, %zmm15, %zmm15 +; AVX512DQ-NEXT: vpmovd2m %zmm15, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm15 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm13, %k1 -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm13 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm13 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm11, %k1 -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm11 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm11 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm9, %k1 -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm9 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm2 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm7, %k1 +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm7 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm5, %k1 -; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm2 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm27, %k1 -; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm27 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 512(%rsi), %zmm27 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm26, %k1 -; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm26 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 576(%rsi), %zmm26 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm25, %k1 -; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm25 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 640(%rsi), %zmm25 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm24, %k1 -; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm24 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 704(%rsi), %zmm24 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm23, %k1 -; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm23 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 -; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 768(%rsi), %zmm23 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm22, %k1 -; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 832(%rsi), %zmm22 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 +; AVX512DQ-NEXT: vmovdqa32 896(%rsi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm21, %k1 -; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm21 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 960(%rsi), %zmm21 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm20, %k1 -; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm20 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1024(%rsi), %zmm20 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm19, %k1 -; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm19 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm18, %k1 -; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm18 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm17, %k1 -; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm17 {%k1} {z} -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k1 -; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1216(%rsi), %zmm17 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm16, %k1 -; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1280(%rsi), %zmm16 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 +; AVX512DQ-NEXT: vmovdqa32 1344(%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm14, %k1 -; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm14 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1408(%rsi), %zmm14 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm12, %k1 -; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm12 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1472(%rsi), %zmm12 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm10, %k1 -; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm10 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1536(%rsi), %zmm10 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm8, %k1 -; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm8 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 1600(%rsi), %zmm8 {%k1} {z} ; AVX512DQ-NEXT: vpmovd2m %zmm6, %k1 -; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm6 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1728(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1664(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1600(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1536(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1472(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1408(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 1344(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1280(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1216(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1152(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1088(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 1024(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 960(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 896(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 832(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 704(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 576(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 512(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 1664(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k1 +; AVX512DQ-NEXT: vmovdqa32 1728(%rsi), %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 1728(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 1664(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 1600(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 1536(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 1472(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 1408(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 1344(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 1280(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 1216(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 1088(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 1024(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 960(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 896(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 832(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 768(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 704(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 640(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm26, 576(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm27, 512(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: mask_replication_factor7_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k4 -; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-5, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-33, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-65, %ax -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-129, %ax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k5 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k0, %k6 -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $4, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kandw %k3, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k4 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $13, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $14, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $15, %k4, %k1 -; AVX512BW-NEXT: kmovq %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovq %k3, %k2 -; AVX512BW-NEXT: kshiftrq $16, %k3, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $17, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $18, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $19, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $20, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $21, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $22, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $24, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $26, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $27, %k4, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $28, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $29, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $30, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $31, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k4, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $33, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $37, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $43, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $44, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k4, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $51, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $54, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $56, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $57, %k2, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k7 -; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $58, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $60, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $61, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k0, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1536(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 1280(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1152(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1088(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-LABEL: mask_replication_factor7_vf64: +; AVX512BW-ONLY: # %bb.0: +; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[6,7,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45,61,61,62,62,62,62,62,62,62,63,63,63,63,63,63,63] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-ONLY-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,6,7,6,7] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18,34,34,34,34,34,35,35,35,35,35,35,35,36,36,36,36,52,52,52,53,53,53,53,53,53,53,54,54,54,54,54,54] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,41,41,41,41,41,41,42,42,42,42,42,42,42,43,43,43,59,59,59,59,60,60,60,60,60,60,60,61,61,61,61,61] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,4,5,4,5] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31,32,32,32,32,32,32,32,33,33,33,33,33,33,33,34,34,50,50,50,50,50,51,51,51,51,51,51,51,52,52,52,52] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,2,3] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k6 +; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57] +; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k7 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k7, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k7, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k6, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k6, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k5} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k5, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k5, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k4} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k3} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512BW-ONLY-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512BW-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k2} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm26, 1664(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm25, 1600(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm24, 1536(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-ONLY-NEXT: vzeroupper +; AVX512BW-ONLY-NEXT: retq +; +; AVX512VBMI-ONLY-LABEL: mask_replication_factor7_vf64: +; AVX512VBMI-ONLY: # %bb.0: +; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,55,55,55,55,55,55,55,56,56,56,56,56,56,56,57,57,57,57,57,57,57,58,58,58,58,58,58,58,59,59,59,59,59,59,59,60,60,60,60,60,60,60,61,61,61,61,61,61,61,62,62,62,62,62,62,62,63,63,63,63,63,63,63] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI-ONLY-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [45,45,46,46,46,46,46,46,46,47,47,47,47,47,47,47,48,48,48,48,48,48,48,49,49,49,49,49,49,49,50,50,50,50,50,50,50,51,51,51,51,51,51,51,52,52,52,52,52,52,52,53,53,53,53,53,53,53,54,54,54,54,54,54] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,38,39,39,39,39,39,39,39,40,40,40,40,40,40,40,41,41,41,41,41,41,41,42,42,42,42,42,42,42,43,43,43,43,43,43,43,44,44,44,44,44,44,44,45,45,45,45,45] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [27,27,27,27,28,28,28,28,28,28,28,29,29,29,29,29,29,29,30,30,30,30,30,30,30,31,31,31,31,31,31,31,32,32,32,32,32,32,32,33,33,33,33,33,33,33,34,34,34,34,34,34,34,35,35,35,35,35,35,35,36,36,36,36] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k4 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,20,20,20,21,21,21,21,21,21,21,22,22,22,22,22,22,22,23,23,23,23,23,23,23,24,24,24,24,24,24,24,25,25,25,25,25,25,25,26,26,26,26,26,26,26,27,27,27] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k5 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,11,11,11,12,12,12,12,12,12,12,13,13,13,13,13,13,13,14,14,14,14,14,14,14,15,15,15,15,15,15,15,16,16,16,16,16,16,16,17,17,17,17,17,17,17,18,18] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k6 +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9] +; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k7 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k7} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k7, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k7, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k6} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k6, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k6, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k5} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k5, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k5, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k4} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k3} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1152(%rsi), %zmm18 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1536(%rsi), %zmm24 {%k2} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm27, 1728(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm26, 1664(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm25, 1600(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm24, 1536(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm23, 1472(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm22, 1408(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm21, 1344(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm20, 1280(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm19, 1216(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm18, 1152(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm17, 1088(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm16, 1024(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512VBMI-ONLY-NEXT: vzeroupper +; AVX512VBMI-ONLY-NEXT: retq %src.mask = load <64 x i1>, ptr %in.maskvec, align 64 %tgt.mask = shufflevector <64 x i1> %src.mask, <64 x i1> poison, <448 x i32> %data = call <448 x i32> @llvm.masked.load.v448i32.p0(ptr %in.vec, i32 64, <448 x i1> %tgt.mask, <448 x i32> poison) @@ -12946,95 +5215,95 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; ; AVX512DQ-LABEL: mask_replication_factor8_vf8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-ONLY-LABEL: mask_replication_factor8_vf8: ; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-ONLY-NEXT: vzeroupper ; AVX512BW-ONLY-NEXT: retq ; ; AVX512VBMI-ONLY-LABEL: mask_replication_factor8_vf8: ; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 +; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z} ; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1 +; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512VBMI-ONLY-NEXT: vzeroupper ; AVX512VBMI-ONLY-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -13050,48 +5319,48 @@ ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k4 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k6 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k7 -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512F-ONLY-NEXT: vzeroupper ; AVX512F-ONLY-NEXT: retq ; @@ -13099,48 +5368,48 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k4 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -13153,28 +5422,28 @@ ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k3} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -13364,50 +5633,50 @@ ; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k4} {z} ; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} -; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k5} {z} ; AVX512BW-NEXT: kshiftrq $32, %k4, %k4 -; AVX512BW-NEXT: kshiftrd $16, %k4, %k5 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z} -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z} +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k4} {z} +; AVX512BW-NEXT: kshiftrd $16, %k4, %k4 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k4} {z} +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k3} {z} ; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k4} {z} ; AVX512BW-NEXT: kshiftrq $32, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $16, %k3, %k4 -; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z} -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k3} {z} +; AVX512BW-NEXT: kshiftrd $16, %k3, %k3 +; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k3} {z} +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k3} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $16, %k2, %k3 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z} -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $16, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z} +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 -; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z} -; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 768(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 832(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 512(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} +; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm15, 960(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 896(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm13, 832(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 768(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 704(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 448(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1107,9 +1107,9 @@ ; SSE41-LABEL: constant_rotate_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] @@ -1118,13 +1118,13 @@ ; ; AVX1-LABEL: constant_rotate_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -901,21 +901,21 @@ ; AVX1-LABEL: constant_rotate_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -1848,29 +1848,28 @@ ; AVX1-LABEL: load_sext_4i1_to_4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movl %eax, %ecx +; AVX1-NEXT: shrb $3, %cl +; AVX1-NEXT: movzbl %cl, %ecx +; AVX1-NEXT: negq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm0 ; AVX1-NEXT: movzbl %al, %ecx -; AVX1-NEXT: shrb %al +; AVX1-NEXT: shrb $2, %al ; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: negl %edx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: movl %ecx, %eax -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shrb $3, %cl +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: shrb %cl ; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: negl %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: negq %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -3519,13 +3518,10 @@ ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $28, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: shll $15, %edx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: sarl $15, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shldq $13, %rax, %rcx +; SSE2-NEXT: shll $15, %ecx +; SSE2-NEXT: sarl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shrq $34, %rax ; SSE2-NEXT: shll $15, %eax ; SSE2-NEXT: sarl $15, %eax @@ -3548,13 +3544,10 @@ ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $28, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; SSSE3-NEXT: shll $15, %edx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: sarl $15, %edx -; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shldq $13, %rax, %rcx +; SSSE3-NEXT: shll $15, %ecx +; SSSE3-NEXT: sarl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: shrq $34, %rax ; SSSE3-NEXT: shll $15, %eax ; SSSE3-NEXT: sarl $15, %eax @@ -3566,53 +3559,47 @@ ; SSE41-LABEL: sext_4i17_to_4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movq (%rdi), %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq $17, %rcx +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: movq %rax, %rdx +; SSE41-NEXT: movl 8(%rdi), %esi +; SSE41-NEXT: shldq $13, %rax, %rsi +; SSE41-NEXT: shrq $17, %rax +; SSE41-NEXT: shll $15, %eax +; SSE41-NEXT: sarl $15, %eax ; SSE41-NEXT: shll $15, %ecx ; SSE41-NEXT: sarl $15, %ecx -; SSE41-NEXT: movl %eax, %edx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: shrq $34, %rdx ; SSE41-NEXT: shll $15, %edx ; SSE41-NEXT: sarl $15, %edx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq $34, %rcx -; SSE41-NEXT: shll $15, %ecx -; SSE41-NEXT: sarl $15, %ecx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: movl 8(%rdi), %ecx -; SSE41-NEXT: shll $28, %ecx -; SSE41-NEXT: shrq $51, %rax -; SSE41-NEXT: shll $15, %eax -; SSE41-NEXT: orl %ecx, %eax -; SSE41-NEXT: sarl $15, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: pinsrd $2, %edx, %xmm0 +; SSE41-NEXT: shll $15, %esi +; SSE41-NEXT: sarl $15, %esi +; SSE41-NEXT: pinsrd $3, %esi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: sext_4i17_to_4i32: ; AVX: # %bb.0: ; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $17, %rcx +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: movq %rax, %rdx +; AVX-NEXT: movl 8(%rdi), %esi +; AVX-NEXT: shldq $13, %rax, %rsi +; AVX-NEXT: shrq $17, %rax +; AVX-NEXT: shll $15, %eax +; AVX-NEXT: sarl $15, %eax ; AVX-NEXT: shll $15, %ecx ; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: movl %eax, %edx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: shrq $34, %rdx ; AVX-NEXT: shll $15, %edx ; AVX-NEXT: sarl $15, %edx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shrq $34, %rcx -; AVX-NEXT: shll $15, %ecx -; AVX-NEXT: sarl $15, %ecx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl 8(%rdi), %ecx -; AVX-NEXT: shll $28, %ecx -; AVX-NEXT: shrq $51, %rax -; AVX-NEXT: shll $15, %eax -; AVX-NEXT: orl %ecx, %eax -; AVX-NEXT: sarl $15, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX-NEXT: shll $15, %esi +; AVX-NEXT: sarl $15, %esi +; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; X86-SSE2-LABEL: sext_4i17_to_4i32: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1180,7 +1180,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -1190,7 +1189,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1199,9 +1197,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -1209,9 +1206,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1306,7 +1306,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1328,8 +1327,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -961,7 +961,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -971,7 +970,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -980,9 +978,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -990,9 +987,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1074,7 +1074,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1093,8 +1092,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -865,7 +865,6 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper @@ -875,7 +874,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -884,9 +882,8 @@ ; ; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper @@ -894,9 +891,8 @@ ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -995,7 +995,6 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1013,8 +1012,7 @@ ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -113,77 +113,51 @@ } define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { -; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; SSE: # %bb.0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: retq -; -; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX2-SLOW-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] -; AVX2-FAST-NEXT: retq +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSSE3-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] -; AVX512VL-NEXT: retq +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; SSE41-NEXT: retq ; -; XOP-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; XOP: # %bb.0: -; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; XOP-NEXT: retq +; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { -; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; SSE: # %bb.0: -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: retq -; -; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE2: # %bb.0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: retq ; -; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX2-FAST-NEXT: retq +; SSSE3-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; SSSE3-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX512VL-NEXT: retq +; SSE41-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; SSE41-NEXT: retq ; -; XOP-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; XOP: # %bb.0: -; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; XOP-NEXT: retq +; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -969,13 +969,27 @@ ; ; SSE41-LABEL: insert_reg_lo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $0, %rdi, %xmm0 +; SSE41-NEXT: movq %rdi, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_reg_lo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_reg_lo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_reg_lo_v2i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %rdi, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512VL-NEXT: retq %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -999,12 +1013,14 @@ ; ; SSE41-LABEL: insert_mem_lo_v2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0 +; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: insert_mem_lo_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX-NEXT: retq %a = load i64, ptr %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 @@ -1013,32 +1029,16 @@ } define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { -; SSE2-LABEL: insert_reg_hi_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_reg_hi_v2i64: -; SSE3: # %bb.0: -; SSE3-NEXT: movq %rdi, %xmm1 -; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_reg_hi_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq %rdi, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_reg_hi_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $1, %rdi, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: insert_reg_hi_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movq %rdi, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX-LABEL: insert_reg_hi_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 +; AVX-NEXT: vmovq %rdi, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %v = insertelement <2 x i64> undef, i64 %a, i32 0 %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> @@ -1046,32 +1046,16 @@ } define <2 x i64> @insert_mem_hi_v2i64(ptr %ptr, <2 x i64> %b) { -; SSE2-LABEL: insert_mem_hi_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_mem_hi_v2i64: -; SSE3: # %bb.0: -; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_mem_hi_v2i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_mem_hi_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: insert_mem_hi_v2i64: +; SSE: # %bb.0: +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX-LABEL: insert_mem_hi_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %a = load i64, ptr %ptr %v = insertelement <2 x i64> undef, i64 %a, i32 0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1517,13 +1517,13 @@ define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: shuffle_v4i32_2456: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4i32_2456: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm1[0,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE3-NEXT: retq ; @@ -2052,19 +2052,19 @@ define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) { ; SSE2-LABEL: extract3_insert3_v4i32_0127: ; SSE2: # %bb.0: -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: extract3_insert3_v4i32_0127: ; SSE3: # %bb.0: -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: extract3_insert3_v4i32_0127: ; SSSE3: # %bb.0: -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[2,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1330,7 +1330,7 @@ ; SSE2-LABEL: shuffle_v8i16_032dXXXX: ; SSE2: # %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -3259,10 +3259,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i32: ; XOPAVX1: # %bb.0: @@ -3273,7 +3281,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -3283,24 +3293,12 @@ } define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) { -; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_elt3_mem_v8i16_i32: +; SSE: # %bb.0: +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32: ; AVX1: # %bb.0: @@ -3309,10 +3307,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i32: ; XOPAVX1: # %bb.0: @@ -3323,7 +3329,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -3388,33 +3396,20 @@ } define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(ptr %ptr) { -; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movswl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movswl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movswl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] -; SSE41-NEXT: retq +; SSE-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; SSE: # %bb.0: +; SSE-NEXT: movswl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: ; AVX1: # %bb.0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: @@ -3436,7 +3431,8 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: movswl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: @@ -3457,14 +3453,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) { ; SSE-LABEL: insert_dup_mem_v8i16_i64: ; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq @@ -3476,7 +3472,7 @@ ; ; XOPAVX1-LABEL: insert_dup_mem_v8i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq @@ -3545,10 +3541,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64: ; XOPAVX1: # %bb.0: @@ -3559,7 +3563,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -3596,10 +3602,18 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %xmm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64: ; XOPAVX1: # %bb.0: @@ -3610,7 +3624,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %xmm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -54,24 +54,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -109,24 +114,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -164,24 +174,29 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -217,11 +232,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -255,11 +282,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -293,11 +332,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -331,11 +382,23 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1007,26 +1070,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -1060,26 +1128,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -1113,26 +1186,31 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-FAST-NEXT: retq ; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -1164,12 +1242,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1199,12 +1289,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1234,12 +1336,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1269,12 +1383,24 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-CROSSLANE: # %bb.0: +; AVX512VL-FAST-CROSSLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VL-FAST-CROSSLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-CROSSLANE-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512VL-FAST-PERLANE-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1687,12 +1813,19 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: @@ -3428,8 +3561,8 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,4,5,8,9,14,15] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,0,1,2,3,4,5],xmm3[6,7] +; XOPAVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -3686,11 +3819,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: @@ -4353,8 +4487,8 @@ ; ; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-SLOW-NEXT: retq ; @@ -4367,8 +4501,8 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4388,8 +4522,8 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] -; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -4400,11 +4534,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: @@ -4459,11 +4594,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: @@ -4518,11 +4654,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: @@ -4577,11 +4714,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: @@ -4696,11 +4834,12 @@ ; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: @@ -4755,11 +4894,12 @@ ; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: @@ -4814,11 +4954,12 @@ ; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,u,u] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: @@ -4911,11 +5052,24 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,3,5,6,4,7,5] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,0,1,2,3,8,9,14,15,16,17,18,19,20,21,22,23,16,17,18,19,24,25,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,10,11,0,1,2,3,4,5,14,15,16,17,18,19,24,25,26,27,16,17,18,19,20,21,30,31] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11: ; AVX512VL: # %bb.0: @@ -5099,8 +5253,8 @@ ; ; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-SLOW-NEXT: retq ; @@ -5113,8 +5267,8 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5134,8 +5288,8 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] -; XOPAVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6094,8 +6248,7 @@ ; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,1,4,5,8,9,4,5,0,1,4,5,8,9,4,5] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] @@ -6153,9 +6306,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq ; @@ -6177,9 +6330,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6335,9 +6488,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq ; @@ -6353,16 +6506,17 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,4,5,10,11] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,0,1,2,3],xmm3[4,5] +; XOPAVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -6519,7 +6673,7 @@ ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: retq @@ -6535,15 +6689,16 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; XOPAVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5] -; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,4,5,10,11] +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3],xmm3[4,5] +; XOPAVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; XOPAVX2-NEXT: retq @@ -6597,7 +6752,7 @@ ; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; AVX2: # %bb.0: ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; AVX2-NEXT: retq @@ -6622,7 +6777,7 @@ ; XOPAVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] ; XOPAVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] ; XOPAVX2-NEXT: retq @@ -7537,30 +7692,17 @@ ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: PR34369: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: PR34369: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: PR34369: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: PR34369: ; AVX512VL: # %bb.0: @@ -7635,21 +7777,29 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzwl (%rdi), %eax +; XOPAVX1-NEXT: movswl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] @@ -7658,7 +7808,9 @@ ; ; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; XOPAVX2-NEXT: movswl (%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i32 @@ -7677,10 +7829,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: ; XOPAVX1: # %bb.0: @@ -7692,7 +7852,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -7710,10 +7872,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 2(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 2(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i32: ; XOPAVX1: # %bb.0: @@ -7725,7 +7895,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i32: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 2(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -7809,10 +7981,18 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64: ; XOPAVX1: # %bb.0: @@ -7824,7 +8004,9 @@ ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -7836,26 +8018,38 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(ptr %ptr) { ; AVX1-LABEL: insert_dup_elt7_mem_v16i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v16i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw 6(%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movzwl 6(%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movzwl 6(%rdi), %eax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; XOPAVX2-NEXT: movzwl 6(%rdi), %eax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 @@ -7867,22 +8061,30 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movswq (%rdi), %rax +; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: movswq (%rdi), %rax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movswq (%rdi), %rax +; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzwl (%rdi), %eax -; XOPAVX1-NEXT: vmovd %eax, %xmm0 +; XOPAVX1-NEXT: movswq (%rdi), %rax +; XOPAVX1-NEXT: vmovq %rax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7890,7 +8092,9 @@ ; ; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0 +; XOPAVX2-NEXT: movswq (%rdi), %rax +; XOPAVX2-NEXT: vmovd %eax, %xmm0 +; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i64 @@ -7960,11 +8164,19 @@ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[26,27],zero,zero ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: pr43230: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: pr43230: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: pr43230: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: retq ; ; XOPAVX1-LABEL: pr43230: ; XOPAVX1: # %bb.0: @@ -7985,7 +8197,9 @@ ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; XOPAVX2-NEXT: vpshlw %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; XOPAVX2-NEXT: retq %shr = lshr <16 x i16> %a, %b %shuf = shufflevector <16 x i16> zeroinitializer, <16 x i16> %shr, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4433,11 +4433,24 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,1,5,2,6,3,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] +; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_00_01_16_17_02_03_18_19_04_05_20_21_06_07_22_23_08_09_24_25_10_11_26_27_12_13_28_29_14_15_30_31: ; AVX512VL: # %bb.0: @@ -4770,18 +4783,11 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: retq ; -; AVX2-FAST-ALL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-FAST-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: @@ -5145,7 +5151,7 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(ptr %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: movsbl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 @@ -5159,7 +5165,7 @@ ; ; XOPAVX1-LABEL: insert_dup_mem_v32i8_sext_i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: movzbl (%rdi), %eax +; XOPAVX1-NEXT: movsbl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -2008,14 +2008,14 @@ ; ; AVX2-LABEL: add_v4f64_024u_135u_reverse: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,1] +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: add_v4f64_024u_135u_reverse: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,1] +; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] ; AVX512VL-NEXT: retq %shuffle0 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -3686,37 +3686,43 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: broadcast_concat_crash: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: broadcast_concat_crash: ; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX512VL-SLOW-NEXT: retq ; -; AVX512VL-FAST-LABEL: broadcast_concat_crash: -; AVX512VL-FAST: # %bb.0: # %entry -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [1,4,3,3] -; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 -; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-NEXT: retq +; AVX512VL-FAST-ALL-LABEL: broadcast_concat_crash: +; AVX512VL-FAST-ALL: # %bb.0: # %entry +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: vbroadcastss %xmm2, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [3,13,1,1,3,13,1,1] +; AVX512VL-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512VL-FAST-ALL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: broadcast_concat_crash: +; AVX512VL-FAST-PERLANE: # %bb.0: # %entry +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vbroadcastss %xmm2, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512VL-FAST-PERLANE-NEXT: retq entry: %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> %bc = bitcast <8 x float> %tmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -586,8 +586,8 @@ define <16 x float> @insert_sub01_8(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub01_8: ; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; ALL-NEXT: vinsertf32x4 $1, %xmm2, %zmm1, %zmm1 ; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> @@ -600,9 +600,9 @@ define <16 x float> @insert_sub23_0(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub23_0: ; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm3 killed $xmm3 def $ymm3 -; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm1 -; ALL-NEXT: vinsertf64x4 $0, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vinsertf32x4 $2, %xmm3, %zmm0, %zmm1 +; ALL-NEXT: vinsertf32x4 $3, %xmm4, %zmm1, %zmm1 +; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> %sub34 = shufflevector <4 x float> %sub3, <4 x float> %sub4, <8 x i32> @@ -867,8 +867,8 @@ ; ALL-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0] ; ALL-NEXT: vmulps 32(%rdi), %ymm0, %ymm0 ; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 -; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,0,1,0,1] -; ALL-NEXT: vmovapd %ymm0, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf32x4 $2, %zmm0, {{[0-9]+}}(%rsp) +; ALL-NEXT: vextractf32x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; ALL-NEXT: movq %rbp, %rsp ; ALL-NEXT: popq %rbp ; ALL-NEXT: .cfi_def_cfa %rsp, 8 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -285,13 +285,16 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(ptr %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i32 @@ -304,13 +307,16 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(ptr %ptr) #0 { ; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 @@ -322,13 +328,16 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(ptr %ptr) #0 { ; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: movzwl 2(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0 +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i32, ptr %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 @@ -376,13 +385,16 @@ define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; KNL-NEXT: movzwl 6(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0 +; SKX-NEXT: movzwl 6(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0 @@ -394,13 +406,16 @@ define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw 6(%rdi), %ymm0 +; KNL-NEXT: movzwl 6(%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw 6(%rdi), %zmm0 +; SKX-NEXT: movzwl 6(%rdi), %eax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i64, ptr %ptr, align 4 %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1 @@ -412,13 +427,16 @@ define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: movswq (%rdi), %rax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64: ; SKX: ## %bb.0: -; SKX-NEXT: vpbroadcastw (%rdi), %zmm0 +; SKX-NEXT: movswq (%rdi), %rax +; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i16, ptr %ptr, align 2 %tmp1 = sext i16 %tmp to i64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -290,27 +290,22 @@ ;Negative test. define <8 x float> @expand15(<4 x float> %a) { -; AVX512-SLOW-LABEL: expand15: -; AVX512-SLOW: # %bb.0: -; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] -; AVX512-SLOW-NEXT: ret{{[l|q]}} -; -; AVX512-FAST-LABEL: expand15: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0] -; AVX512-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] -; AVX512-FAST-NEXT: ret{{[l|q]}} +; AVX512-LABEL: expand15: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,8,3,9,5,6,7] +; AVX512-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1 +; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: expand15: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,16,3,17,5,6,7] +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; AVX512F-NEXT: vmovaps %ymm1, %ymm0 ; AVX512F-NEXT: ret{{[l|q]}} %addV = fadd <4 x float> , %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> @@ -572,8 +567,11 @@ ; X86-AVX512-SLOW-NEXT: vpbroadcastd 44(%ecx), %xmm0 ; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%eax) -; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X86-AVX512-SLOW-NEXT: vmovdqa 208(%ecx), %xmm0 +; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X86-AVX512-SLOW-NEXT: vmovd %xmm0, %ecx +; X86-AVX512-SLOW-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512-SLOW-NEXT: vzeroupper ; X86-AVX512-SLOW-NEXT: retl @@ -583,8 +581,11 @@ ; X64-AVX512-SLOW-NEXT: vpbroadcastd 44(%rdi), %xmm0 ; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%rsi) -; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X64-AVX512-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X64-AVX512-SLOW-NEXT: vmovd %xmm0, %eax +; X64-AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512-SLOW-NEXT: vzeroupper ; X64-AVX512-SLOW-NEXT: retq @@ -597,7 +598,9 @@ ; X86-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%eax) ; X86-AVX512-FAST-NEXT: vmovdqa 208(%ecx), %xmm0 -; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; X86-AVX512-FAST-NEXT: vmovd %xmm0, %ecx +; X86-AVX512-FAST-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512-FAST-NEXT: vzeroupper ; X86-AVX512-FAST-NEXT: retl @@ -608,7 +611,9 @@ ; X64-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%rsi) ; X64-AVX512-FAST-NEXT: vmovdqa 208(%rdi), %xmm0 -; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; X64-AVX512-FAST-NEXT: vmovd %xmm0, %eax +; X64-AVX512-FAST-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512-FAST-NEXT: vzeroupper ; X64-AVX512-FAST-NEXT: retq @@ -620,8 +625,11 @@ ; X86-AVX512F-NEXT: vpbroadcastd 44(%ecx), %xmm0 ; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X86-AVX512F-NEXT: vmovdqa %ymm0, 672(%eax) -; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X86-AVX512F-NEXT: vmovdqa 208(%ecx), %xmm0 +; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X86-AVX512F-NEXT: vmovd %xmm0, %ecx +; X86-AVX512F-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0 ; X86-AVX512F-NEXT: vmovdqa %ymm0, 832(%eax) ; X86-AVX512F-NEXT: vzeroupper ; X86-AVX512F-NEXT: retl @@ -631,8 +639,11 @@ ; X64-AVX512F-NEXT: vpbroadcastd 44(%rdi), %xmm0 ; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX512F-NEXT: vmovdqa %ymm0, 672(%rsi) -; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] -; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; X64-AVX512F-NEXT: vmovdqa 208(%rdi), %xmm0 +; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero +; X64-AVX512F-NEXT: vmovd %xmm0, %eax +; X64-AVX512F-NEXT: vpinsrd $1, %eax, %xmm1, %xmm0 ; X64-AVX512F-NEXT: vmovdqa %ymm0, 832(%rsi) ; X64-AVX512F-NEXT: vzeroupper ; X64-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -132,19 +132,10 @@ } define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) { -; AVX-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; AVX-NEXT: ret{{[l|q]}} -; -; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX512-NEXT: ret{{[l|q]}} +; CHECK-LABEL: combine_vpermilvar_vperm2f128_zero_8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> ) @@ -439,7 +430,7 @@ ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -491,16 +482,16 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [1,0,2,0,8,0,9,0] -; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,10,0,2,0,9,0] -; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6 -; X86-AVX512-NEXT: vmovapd %ymm6, (%edx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] -; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 -; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) +; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,0,2,0,8,0,9,0] +; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 +; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2,3] +; X86-AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] +; X86-AVX512-NEXT: vmovapd %ymm3, (%edx) +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0] +; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 +; X86-AVX512-NEXT: vmovapd %ymm3, (%ecx) ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0] ; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 @@ -513,7 +504,7 @@ ; X64-AVX1-LABEL: PR48908: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,1,2,2] +; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm5 ; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[1],ymm4[0],ymm5[2],ymm4[3] @@ -562,10 +553,10 @@ ; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,8,9] ; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,10,2,9] -; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6 -; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi) +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2,3] +; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3] +; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi) ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1] ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 ; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) @@ -756,3 +747,5 @@ %v1 = shufflevector <16 x i64> %v0, <16 x i64> undef, <16 x i32> ret <16 x i64> %v1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -55,6 +55,9 @@ ; CHECK-LABEL: combine_and_pshufb: ; CHECK: # %bb.0: ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> @@ -128,7 +131,7 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,0,4,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> @@ -838,16 +841,29 @@ ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} ; -; AVX512-LABEL: PR34577: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> -; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: ret{{[l|q]}} +; X86-AVX512-LABEL: PR34577: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <8,0,u,u,1,0,u,u> +; X86-AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X86-AVX512-NEXT: vpermt2pd %zmm3, %zmm2, %zmm0 +; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,23,18,4,5,19,18] +; X86-AVX512-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: PR34577: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <8,u,1,u> +; X64-AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm2, %zmm0 +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,23,18,4,5,19,18] +; X64-AVX512-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; X64-AVX512-NEXT: retq entry: %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> %sel = select <8 x i1> , <8 x float> %shuf0, <8 x float> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -54,10 +54,10 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86-LABEL: combine_pshufb_identity_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} @@ -102,8 +102,8 @@ define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) { ; X86-LABEL: combine_pshufb_as_pslldq_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl ; @@ -128,8 +128,8 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) { ; X86-LABEL: combine_pshufb_as_psrldq_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} ; X86-NEXT: retl ; @@ -158,10 +158,10 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) { ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -110,8 +110,8 @@ ; X86-NEXT: vpmovqw %ymm1, %xmm1 ; X86-NEXT: vpsllw $8, %xmm0, %xmm0 ; X86-NEXT: vpsraw $8, %xmm0, %xmm0 -; X86-NEXT: vpsllw $8, %xmm1, %xmm1 -; X86-NEXT: vpsraw $8, %xmm1, %xmm1 +; X86-NEXT: vpsllw $8, %ymm1, %ymm1 +; X86-NEXT: vpsraw $8, %ymm1, %ymm1 ; X86-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: vmovdqu %ymm0, (%eax) ; X86-NEXT: vzeroupper @@ -123,11 +123,14 @@ ; X64-NEXT: vmovdqu (%rax), %ymm1 ; X64-NEXT: vpmovqw %ymm0, %xmm0 ; X64-NEXT: vpmovqw %ymm1, %xmm1 -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; X64-NEXT: vpsllw $8, %ymm0, %ymm0 -; X64-NEXT: vpsraw $8, %ymm0, %ymm0 -; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1] -; X64-NEXT: vmovdqu %ymm0, (%rdi) +; X64-NEXT: vpsllw $8, %xmm0, %xmm0 +; X64-NEXT: vpsraw $8, %xmm0, %xmm0 +; X64-NEXT: vpsllw $8, %xmm1, %xmm1 +; X64-NEXT: vpsraw $8, %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: vmovdqu %xmm0, (%rdi) +; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmovdqu %xmm0, 16(%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %2 = load <4 x i64>, ptr null, align 8 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -128,8 +128,8 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64> %a1, i64 %m) { ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [56,57,58,59,56,57,58,59,56,57,58,59,56,57,58,59,44,45,46,47,44,45,46,47,44,45,46,47,44,45,46,47,96,97,98,99,96,97,98,99,96,97,98,99,96,97,98,99,116,117,118,119,116,117,118,119,116,117,118,119,116,117,118,119] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 {%k1} {z} ; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -25,35 +25,37 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u> -; SSE-NEXT: pshufb %xmm6, %xmm0 +; SSE-NEXT: pshufb %xmm6, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u> -; SSE-NEXT: pshufb %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pshufb %xmm7, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; SSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE-NEXT: pshufb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,1,4,7,10,13] +; SSE-NEXT: pshufb %xmm9, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pshufb %xmm6, %xmm5 ; SSE-NEXT: pshufb %xmm7, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE-NEXT: pmullw %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u> -; SSE-NEXT: pshufb %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u> -; SSE-NEXT: pshufb %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufb %xmm6, %xmm2 -; SSE-NEXT: pshufb %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE-NEXT: pshufb %xmm3, %xmm4 +; SSE-NEXT: pshufb %xmm9, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: pmullw %xmm8, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE-NEXT: pmullw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq %x1 = load <48 x i8>, ptr %p1, align 16 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -469,9 +469,15 @@ } define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) { -; CHECK-LABEL: combine_pshufb_as_unpacklo_undef: -; CHECK: # %bb.0: -; CHECK-NEXT: retq +; SSE-LABEL: combine_pshufb_as_unpacklo_undef: +; SSE: # %bb.0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufb_as_unpacklo_undef: +; AVX: # %bb.0: +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; AVX-NEXT: retq %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) %2 = bitcast <16 x i8> %1 to <8 x i16> %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1724,46 +1724,38 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE2-LABEL: combine_test1c: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test1c: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test1c: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rsi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test1c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test1c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test1c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1772,18 +1764,34 @@ } define <4 x i8> @combine_test2c(ptr %a, ptr %b) { -; SSE-LABEL: combine_test2c: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: retq +; SSE2-LABEL: combine_test2c: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rsi), %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2c: ; AVX: # %bb.0: ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b @@ -1793,20 +1801,34 @@ } define <4 x i8> @combine_test3c(ptr %a, ptr %b) { -; SSE-LABEL: combine_test3c: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: retq +; SSE2-LABEL: combine_test3c: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0 +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test3c: ; AVX: # %bb.0: ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b @@ -1818,46 +1840,38 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE2-LABEL: combine_test4c: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4c: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test4c: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test4c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test4c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test4c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,2,3,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -2479,23 +2493,23 @@ ; AVX2-SLOW-LABEL: combine_unneeded_subvector1: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> @@ -2649,14 +2663,16 @@ define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) { ; SSE-LABEL: combine_scalar_load_with_blend_with_zero: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movaps %xmm0, (%rsi) +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE-NEXT: movdqa %xmm0, (%rsi) ; SSE-NEXT: retq ; ; AVX-LABEL: combine_scalar_load_with_blend_with_zero: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovaps %xmm0, (%rsi) +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovdqa %xmm0, (%rsi) ; AVX-NEXT: retq %1 = load double, ptr %a0, align 8 %2 = insertelement <2 x double> undef, double %1, i32 0 @@ -2714,15 +2730,21 @@ ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = -; SSE41-NEXT: pinsrd $0, %edi, %xmm0 +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_constant_insertion_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_constant_insertion_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_constant_insertion_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 %ret = shufflevector <4 x i32> %a0, <4 x i32> , <4 x i32> ret <4 x i32> %ret @@ -3008,40 +3030,56 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm2, %edx +; SSE2-NEXT: pextrw $7, %xmm2, %esi +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-NEXT: pinsrw $5, %edx, %xmm0 +; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: pinsrw $7, %esi, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_concat_insert: ; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pextrw $2, %xmm1, %eax +; SSSE3-NEXT: pextrw $5, %xmm2, %ecx +; SSSE3-NEXT: movd %xmm1, %edx +; SSSE3-NEXT: pextrw $7, %xmm2, %esi ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pinsrw $4, %eax, %xmm0 +; SSSE3-NEXT: pinsrw $5, %ecx, %xmm0 +; SSSE3-NEXT: pinsrw $6, %edx, %xmm0 +; SSSE3-NEXT: pinsrw $7, %esi, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_concat_insert: ; SSE41: # %bb.0: -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pextrw $2, %xmm1, %eax ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: pinsrw $4, %eax, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; SSE41-NEXT: pinsrw $6, %ecx, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_extract_concat_insert: ; AVX: # %bb.0: -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpextrw $2, %xmm1, %eax +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,u,u,u,u,12,13,14,15] +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] +; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; AVX-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 @@ -3067,17 +3105,19 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsbl (%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movl $65531, %ecx # imm = 0xFFFB +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movsbl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movsbl (%rdx), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: @@ -3093,7 +3133,9 @@ ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSSE3-NEXT: movl $65531, %eax # imm = 0xFFFB +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; @@ -3149,23 +3191,58 @@ ; Bug noticed in D96345 define i32 @shuffle_binops_with_undef() { -; SSE-LABEL: shuffle_binops_with_undef: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movdqa (%rax), %xmm0 -; SSE-NEXT: paddw %xmm0, %xmm0 -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_binops_with_undef: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movdqa (%rax), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: psrlw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, (%rax) +; SSE2-NEXT: retq ; -; AVX-LABEL: shuffle_binops_with_undef: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa (%rax), %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rax) -; AVX-NEXT: retq +; SSSE3-LABEL: shuffle_binops_with_undef: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movdqa (%rax), %xmm0 +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: psrlw %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, (%rax) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_binops_with_undef: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movdqa (%rax), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: psrlw %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, (%rax) +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_binops_with_undef: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa (%rax), %xmm0 +; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_binops_with_undef: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa (%rax), %xmm0 +; AVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rax) +; AVX2-NEXT: retq entry: %load0 = load <8 x i16>, ptr undef, align 16 %load1 = load <8 x i16>, ptr undef, align 16 @@ -3185,48 +3262,18 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) define void @PR43024() { -; SSE2-LABEL: PR43024: -; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE2-NEXT: movaps %xmm0, (%rax) -; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: movss %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR43024: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSSE3-NEXT: movaps %xmm0, (%rax) -; SSSE3-NEXT: addss %xmm0, %xmm0 -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: movss %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR43024: -; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps %xmm0, (%rax) -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: movss %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: PR43024: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 +; SSE-NEXT: retq ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rax) +; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 ; AVX-NEXT: retq store <4 x float> , ptr undef, align 16 %1 = load <4 x float>, ptr undef, align 16 @@ -3435,13 +3482,15 @@ ; SSE2-LABEL: SpinningCube: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movapd {{.*#+}} xmm2 = -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movaps {{.*#+}} xmm3 = <0.0E+0,-2.0E+0,u,u> +; SSE2-NEXT: mulps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm2[2,0] ; SSE2-NEXT: addps %xmm3, %xmm1 ; SSE2-NEXT: movaps %xmm1, (%rax) ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -3454,18 +3503,22 @@ ; SSSE3-LABEL: SpinningCube: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSSE3-NEXT: xorps %xmm3, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSSE3-NEXT: addps %xmm3, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSSE3-NEXT: movaps {{.*#+}} xmm2 = <0.0E+0,0.0E+0,-2.0E+0,u> +; SSSE3-NEXT: mulps %xmm2, %xmm1 +; SSSE3-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSSE3-NEXT: movaps %xmm1, %xmm4 +; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSSE3-NEXT: addps %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm3, (%rax) ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] -; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: mulps %xmm2, %xmm1 ; SSSE3-NEXT: addps %xmm0, %xmm1 ; SSSE3-NEXT: movaps %xmm1, (%rax) ; SSSE3-NEXT: retq @@ -3473,31 +3526,35 @@ ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = -; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3] -; SSE41-NEXT: addps %xmm3, %xmm4 +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSE41-NEXT: movaps {{.*#+}} xmm2 = <0.0E+0,0.0E+0,-2.0E+0,u> +; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[2,3] +; SSE41-NEXT: addps %xmm0, %xmm4 ; SSE41-NEXT: movaps %xmm4, (%rax) -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2] -; SSE41-NEXT: mulps %xmm1, %xmm2 -; SSE41-NEXT: addps %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm2, (%rax) +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSE41-NEXT: mulps %xmm2, %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: retq ; ; AVX-LABEL: SpinningCube: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vbroadcastss (%rax), %xmm0 +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,-2.0E+0] +; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX-NEXT: vaddps %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovaps %xmm2, (%rax) ; AVX-NEXT: vbroadcastss (%rax), %xmm2 ; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll @@ -21,8 +21,8 @@ ; ; AVX-LABEL: concat_a_to_shuf_of_a: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rsi) ; AVX-NEXT: vzeroupper @@ -68,8 +68,8 @@ ; ; AVX-LABEL: concat_shuf_of_a_to_a: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper @@ -609,8 +609,8 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 48(%rsi) +; SSE-NEXT: movdqa %xmm0, 32(%rsi) ; SSE-NEXT: movdqa %xmm0, 16(%rsi) ; SSE-NEXT: movdqa %xmm1, (%rsi) ; SSE-NEXT: retq @@ -637,9 +637,8 @@ ; AVX512F-LABEL: concat_aaa_to_shuf_of_a: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,1,0,1,2,3] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -647,9 +646,8 @@ ; AVX512BW-LABEL: concat_aaa_to_shuf_of_a: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,1,0,1,2,3] +; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -694,9 +692,8 @@ ; AVX512F-LABEL: concat_shuf_of_a_to_aaa: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,0,1,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -704,9 +701,8 @@ ; AVX512BW-LABEL: concat_shuf_of_a_to_aaa: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,0,1,1,0] +; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -364,8 +364,8 @@ ; AMD10H: # %bb.0: ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AMD10H-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; AMD10H-NEXT: packuswb %xmm0, %xmm0 ; AMD10H-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -209,11 +209,12 @@ ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,3,5,9,11,15,17,21,23,27,29,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpternlogq $234, %zmm2, %zmm0, %zmm4 @@ -222,14 +223,13 @@ ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; @@ -263,14 +263,13 @@ ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} @@ -344,19 +343,17 @@ ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] -; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 -; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm4, %ymm0 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512F-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 @@ -366,14 +363,14 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm5, %ymm2 +; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -496,11 +493,12 @@ ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,2,4,8,10,14,16,20,22,26,28,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpternlogq $234, %zmm2, %zmm0, %zmm4 @@ -509,14 +507,13 @@ ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; @@ -550,14 +547,13 @@ ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] +; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 ; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -40,11 +40,11 @@ ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: foo: @@ -53,11 +53,11 @@ ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: retq ; @@ -70,10 +70,10 @@ ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $63488, %eax # imm = 0xF800 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -1224,8 +1224,8 @@ ; SSE2-NEXT: andl $3, %ecx ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; @@ -1241,8 +1241,8 @@ ; SSSE3-NEXT: andl $3, %ecx ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; @@ -1256,9 +1256,9 @@ ; SSE41-NEXT: andl $3, %edx ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: andl $3, %ecx -; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; @@ -1273,9 +1273,9 @@ ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; AVX-NEXT: andl $3, %ecx ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 %i0 %x1 = extractelement <4 x float> %x, i32 %i1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2139,11 +2139,9 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: @@ -4659,11 +4657,61 @@ ; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_const_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_const_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_const_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_const_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -4685,11 +4733,61 @@ ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_self_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_self_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_self_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_self_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 @@ -4711,11 +4809,61 @@ ; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,2,3,3] +; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmuldq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-SLOW-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-ALL-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-FAST-ALL: # %bb.0: +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-ALL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-ALL-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-ALL-NEXT: vzeroupper +; AVX2-FAST-ALL-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-FAST-PERLANE-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-FAST-PERLANE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX512-LABEL: mul_add_multiuse_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vpmuldq %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = sext <4 x i32> %a0 to <4 x i64> %2 = sext <4 x i32> %a1 to <4 x i64> %3 = mul <4 x i64> %1, %2 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -25,14 +25,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -55,14 +55,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -184,14 +184,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -215,14 +215,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -308,7 +308,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store: @@ -325,14 +326,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i32_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqd %xmm0, (%rdi) +; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -350,27 +353,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm5 @@ -404,27 +407,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -616,65 +619,65 @@ ; SSE2-LABEL: trunc_packus_v8i64_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 +; SSE2-NEXT: movdqa 16(%rdi), %xmm7 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm7 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 @@ -683,8 +686,8 @@ ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm7, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 @@ -693,7 +696,7 @@ ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -721,65 +724,65 @@ ; SSSE3-LABEL: trunc_packus_v8i64_v8i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm3 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm7 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm7 ; SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm7 ; SSSE3-NEXT: movdqa %xmm1, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 @@ -788,8 +791,8 @@ ; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: movdqa %xmm7, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 @@ -798,7 +801,7 @@ ; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm8, %xmm1 +; SSSE3-NEXT: pand %xmm7, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pxor %xmm0, %xmm4 @@ -1042,14 +1045,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1074,14 +1077,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1217,14 +1220,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1250,14 +1253,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1360,7 +1363,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store: @@ -1377,14 +1381,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i16_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqw %xmm0, (%rdi) +; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1402,27 +1408,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm5 @@ -1462,27 +1468,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -1655,27 +1661,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm5 @@ -1716,27 +1722,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -1870,7 +1876,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1888,7 +1895,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -1896,7 +1904,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusqw %ymm0, (%rdi) +; SKX-NEXT: vpmovusqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -1911,7 +1920,7 @@ define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_packus_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 +; SSE2-NEXT: movdqa (%rdi), %xmm7 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 ; SSE2-NEXT: movdqa 48(%rdi), %xmm6 @@ -1919,79 +1928,79 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm7 ; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm6, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm6, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: por %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm5, %xmm6 @@ -2029,7 +2038,7 @@ ; ; SSSE3-LABEL: trunc_packus_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm8 +; SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm6 @@ -2037,79 +2046,79 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm7 ; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm6, %xmm8 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm6, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm6, %xmm7 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm4 ; SSSE3-NEXT: pxor %xmm0, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm8, %xmm4 +; SSSE3-NEXT: por %xmm8, %xmm4 +; SSSE3-NEXT: pand %xmm7, %xmm4 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, %xmm6 @@ -2427,37 +2436,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_packus_v4i32_v4i16_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusdw %xmm0, (%rdi) +; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -2747,14 +2735,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2780,14 +2768,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2909,14 +2897,14 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2944,14 +2932,14 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -3039,7 +3027,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store: @@ -3056,14 +3045,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v2i64_v2i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusqb %xmm0, (%rdi) +; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -3081,27 +3072,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm5 @@ -3139,27 +3130,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -3338,28 +3329,28 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 @@ -3397,27 +3388,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -3557,7 +3548,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -3575,7 +3567,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -3583,7 +3576,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusqb %ymm0, (%rdi) +; SKX-NEXT: vpmovusqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -3601,58 +3595,58 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm5 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm3 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm7 ; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm6 @@ -3668,17 +3662,17 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 @@ -3712,58 +3706,58 @@ ; SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm7 ; SSSE3-NEXT: pandn %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm6 @@ -3779,17 +3773,17 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm4 ; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm8, %xmm0 +; SSSE3-NEXT: por %xmm8, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm0 ; SSSE3-NEXT: packuswb %xmm4, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pxor %xmm1, %xmm4 @@ -4002,58 +3996,58 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm5 ; SSE2-NEXT: movdqa 16(%rdi), %xmm3 ; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 +; SSE2-NEXT: movdqa 48(%rdi), %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm7 ; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm5, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm6 @@ -4069,17 +4063,17 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm7, %xmm3 ; SSE2-NEXT: packuswb %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm4 @@ -4114,58 +4108,58 @@ ; SSSE3-NEXT: movdqa (%rdi), %xmm5 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255] ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm7, %xmm7 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm7 ; SSSE3-NEXT: pandn %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm5, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm5, %xmm7 ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm6 @@ -4181,17 +4175,17 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 ; SSSE3-NEXT: packuswb %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 ; SSSE3-NEXT: pxor %xmm0, %xmm4 @@ -4378,7 +4372,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -4408,9 +4403,9 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm7 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm12 -; SSE2-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-NEXT: movdqa 80(%rdi), %xmm10 +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa 48(%rdi), %xmm10 +; SSE2-NEXT: movdqa 80(%rdi), %xmm9 ; SSE2-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-NEXT: movdqa 96(%rdi), %xmm3 @@ -4418,105 +4413,105 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm3 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm6, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm5 +; SSE2-NEXT: movdqa %xmm9, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm9 ; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm10, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm12 -; SSE2-NEXT: pandn %xmm6, %xmm10 -; SSE2-NEXT: por %xmm12, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm6, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: por %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: movdqa %xmm8, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pandn %xmm6, %xmm11 -; SSE2-NEXT: por %xmm7, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm7, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: pand %xmm13, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] ; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm8 @@ -4532,19 +4527,19 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm12 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: por %xmm12, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 @@ -4554,18 +4549,18 @@ ; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm7 -; SSE2-NEXT: pand %xmm12, %xmm7 -; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm8 ; SSE2-NEXT: packuswb %xmm7, %xmm8 ; SSE2-NEXT: packuswb %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm6 @@ -4622,9 +4617,9 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm12 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm11 -; SSSE3-NEXT: movdqa 80(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm11 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 80(%rdi), %xmm9 ; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSSE3-NEXT: movdqa 96(%rdi), %xmm3 @@ -4632,105 +4627,105 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm9, %xmm9 -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm2 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm6, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm3 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm4 ; SSSE3-NEXT: pandn %xmm6, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm4 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm5 ; SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm10, %xmm5 +; SSSE3-NEXT: movdqa %xmm9, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm10 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm9 ; SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSSE3-NEXT: por %xmm10, %xmm5 -; SSSE3-NEXT: movdqa %xmm12, %xmm10 -; SSSE3-NEXT: pxor %xmm1, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm10 -; SSSE3-NEXT: pand %xmm10, %xmm12 -; SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSSE3-NEXT: por %xmm12, %xmm10 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pxor %xmm1, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm11 -; SSSE3-NEXT: pandn %xmm6, %xmm12 -; SSSE3-NEXT: por %xmm11, %xmm12 -; SSSE3-NEXT: movdqa %xmm7, %xmm11 +; SSSE3-NEXT: por %xmm9, %xmm5 +; SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm11 +; SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSSE3-NEXT: por %xmm11, %xmm9 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 ; SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: movdqa %xmm8, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm11 -; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm10 ; SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSSE3-NEXT: por %xmm7, %xmm11 +; SSSE3-NEXT: por %xmm10, %xmm11 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pandn %xmm6, %xmm10 +; SSSE3-NEXT: por %xmm7, %xmm10 ; SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm8, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSSE3-NEXT: pand %xmm13, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] ; SSSE3-NEXT: por %xmm7, %xmm8 ; SSSE3-NEXT: pand %xmm8, %xmm0 ; SSSE3-NEXT: pandn %xmm6, %xmm8 @@ -4746,19 +4741,19 @@ ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm6 ; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm0 +; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm12 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm0 -; SSSE3-NEXT: pand %xmm11, %xmm0 +; SSSE3-NEXT: por %xmm12, %xmm0 +; SSSE3-NEXT: pand %xmm10, %xmm0 ; SSSE3-NEXT: packuswb %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm12, %xmm6 +; SSSE3-NEXT: movdqa %xmm11, %xmm6 ; SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 @@ -4768,18 +4763,18 @@ ; SSSE3-NEXT: pand %xmm8, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm7 -; SSSE3-NEXT: pand %xmm12, %xmm7 -; SSSE3-NEXT: movdqa %xmm10, %xmm6 +; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, %xmm6 ; SSSE3-NEXT: pxor %xmm1, %xmm6 ; SSSE3-NEXT: movdqa %xmm6, %xmm8 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pand %xmm10, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm8 -; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm8 ; SSSE3-NEXT: packuswb %xmm7, %xmm8 ; SSSE3-NEXT: packuswb %xmm8, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm6 @@ -5128,13 +5123,14 @@ ; SKX-NEXT: vpmovusqb %ymm1, %xmm1 ; SKX-NEXT: vpmaxsq 64(%rdi), %ymm0, %ymm2 ; SKX-NEXT: vpmovusqb %ymm2, %xmm2 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm2 -; SKX-NEXT: vpmovusqb %ymm2, %xmm2 +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] +; SKX-NEXT: vpermi2d %xmm1, %xmm2, %xmm3 +; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 +; SKX-NEXT: vpmovusqb %ymm1, %xmm1 ; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 ; SKX-NEXT: vpmovusqb %ymm0, %xmm0 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i64>, ptr %p0 @@ -5330,7 +5326,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store: @@ -5346,14 +5343,16 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_packus_v4i32_v4i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovusdb %xmm0, (%rdi) +; SKX-NEXT: vpmovusdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -5473,7 +5472,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -5490,7 +5490,8 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; @@ -5498,7 +5499,8 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovusdb %ymm0, (%rdi) +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -5663,36 +5665,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v8i16_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_packus_v8i16_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_packus_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -25,26 +25,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -57,26 +58,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -179,26 +181,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -212,26 +215,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -304,7 +308,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i32_store: @@ -317,12 +322,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i32_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqd %xmm0, (%rdi) +; SKX-NEXT: vpmovsqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -340,27 +347,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm5 @@ -368,30 +375,31 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i32: @@ -400,27 +408,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -428,30 +436,31 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm8, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i32: @@ -616,109 +625,112 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: movdqa %xmm7, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm7 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm7, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm8 -; SSE2-NEXT: pandn %xmm4, %xmm9 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm3, %xmm7 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i32: @@ -731,109 +743,112 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: movdqa %xmm7, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm6, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm7 ; SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm7, %xmm5 ; SSSE3-NEXT: movdqa %xmm1, %xmm7 ; SSSE3-NEXT: pxor %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm1, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] -; SSSE3-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm8 -; SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm8 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm3, %xmm7 ; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i32: @@ -1054,26 +1069,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1087,26 +1103,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1222,26 +1239,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1256,26 +1274,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147516415,2147516415] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1364,7 +1383,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i16_store: @@ -1377,12 +1397,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqw %xmm0, (%rdi) +; SKX-NEXT: vpmovsqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -1400,27 +1422,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm5 @@ -1428,30 +1450,31 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm6, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: retq ; @@ -1461,27 +1484,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm4, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm3, %xmm5 @@ -1489,30 +1512,31 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm8, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm1, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: packssdw %xmm3, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm3 +; SSSE3-NEXT: por %xmm3, %xmm0 +; SSSE3-NEXT: packssdw %xmm6, %xmm0 ; SSSE3-NEXT: packssdw %xmm0, %xmm0 ; SSSE3-NEXT: retq ; @@ -1646,27 +1670,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm5 @@ -1674,32 +1698,33 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm0, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: packssdw %xmm1, %xmm2 -; SSE2-NEXT: packssdw %xmm2, %xmm2 -; SSE2-NEXT: movq %xmm2, (%rdi) +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: packssdw %xmm6, %xmm1 +; SSE2-NEXT: packssdw %xmm1, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v4i64_v4i16_store: @@ -1708,27 +1733,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -1736,32 +1761,33 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709518848,18446744073709518848] ; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm8, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm0, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm0, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: packssdw %xmm1, %xmm2 -; SSSE3-NEXT: packssdw %xmm2, %xmm2 -; SSSE3-NEXT: movq %xmm2, (%rdi) +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm1 +; SSSE3-NEXT: pand %xmm1, %xmm3 +; SSSE3-NEXT: pandn %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 +; SSSE3-NEXT: packssdw %xmm6, %xmm1 +; SSSE3-NEXT: packssdw %xmm1, %xmm1 +; SSSE3-NEXT: movq %xmm1, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i16_store: @@ -1860,7 +1886,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqw %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1874,13 +1901,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i64_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqw %ymm0, (%rdi) +; SKX-NEXT: vpmovsqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -1903,110 +1932,113 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm8 -; SSE2-NEXT: pandn %xmm4, %xmm9 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm3, %xmm7 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm5, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i16: @@ -2019,110 +2051,113 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm8 -; SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm8 ; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: packssdw %xmm8, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm3, %xmm7 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm5, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSSE3-NEXT: packssdw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i16: @@ -2340,31 +2375,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdw %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i16_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_ssat_v4i32_v4i16_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdw %xmm0, (%rdi) +; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -2490,30 +2510,31 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 @@ -2525,30 +2546,31 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm3, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: por %xmm2, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: retq ; @@ -2646,26 +2668,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2683,26 +2706,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483775,2147483775] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSSE3-NEXT: por %xmm2, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2776,7 +2800,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v2i64_v2i8_store: @@ -2789,12 +2814,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v2i64_v2i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqb %xmm0, (%rdi) +; SKX-NEXT: vpmovsqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -2812,27 +2839,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm5 @@ -2840,33 +2867,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm8, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -2877,27 +2905,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -2905,33 +2933,34 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm8 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm8, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 +; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm6 +; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm1, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm1, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm1, %xmm2 +; SSSE3-NEXT: pshufb %xmm1, %xmm4 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v4i64_v4i8: @@ -3071,27 +3100,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm5 @@ -3099,33 +3128,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSE2-NEXT: por %xmm2, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm4 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) @@ -3137,27 +3167,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm1, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSSE3-NEXT: pxor %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm1, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pandn %xmm4, %xmm5 @@ -3165,33 +3195,34 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551488,18446744073709551488] ; SSSE3-NEXT: movdqa %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] ; SSSE3-NEXT: por %xmm8, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm5 ; SSSE3-NEXT: pandn %xmm0, %xmm1 ; SSSE3-NEXT: por %xmm5, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: pand %xmm4, %xmm5 +; SSSE3-NEXT: movdqa %xmm2, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm0, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: pand %xmm6, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; SSSE3-NEXT: por %xmm2, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pandn %xmm0, %xmm4 +; SSSE3-NEXT: por %xmm3, %xmm4 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm0, %xmm2 +; SSSE3-NEXT: pshufb %xmm0, %xmm4 ; SSSE3-NEXT: pshufb %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; SSSE3-NEXT: movd %xmm1, (%rdi) ; SSSE3-NEXT: retq ; @@ -3298,7 +3329,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -3312,13 +3344,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i64_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsqb %ymm0, (%rdi) +; SKX-NEXT: vpmovsqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <4 x i64> %a0, @@ -3341,110 +3375,113 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm5, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm8 -; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm6, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm8 -; SSE2-NEXT: pandn %xmm4, %xmm9 -; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm4, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: movdqa %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm9, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm4, %xmm7 +; SSE2-NEXT: por %xmm3, %xmm7 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm5, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm7, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; @@ -3458,110 +3495,113 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm5, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm3 ; SSSE3-NEXT: por %xmm5, %xmm3 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: movdqa %xmm0, %xmm6 ; SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm8 -; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pandn %xmm4, %xmm8 -; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm6, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm0 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm8, %xmm0 +; SSSE3-NEXT: movdqa %xmm7, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm6 -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm8 -; SSSE3-NEXT: pandn %xmm4, %xmm9 -; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pandn %xmm4, %xmm8 +; SSSE3-NEXT: por %xmm7, %xmm8 ; SSSE3-NEXT: movdqa %xmm5, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: packssdw %xmm9, %xmm0 +; SSSE3-NEXT: packssdw %xmm8, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm4, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm3 +; SSSE3-NEXT: pandn %xmm4, %xmm7 +; SSSE3-NEXT: por %xmm3, %xmm7 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm6 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm4, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm5, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSSE3-NEXT: pand %xmm5, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm4, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packssdw %xmm7, %xmm3 +; SSSE3-NEXT: packssdw %xmm3, %xmm0 ; SSSE3-NEXT: packsswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; @@ -3755,112 +3795,115 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm5 ; SSE2-NEXT: pandn %xmm4, %xmm2 ; SSE2-NEXT: por %xmm5, %xmm2 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm4, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm6, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm3 ; SSE2-NEXT: pandn %xmm4, %xmm7 ; SSE2-NEXT: por %xmm3, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm8 +; SSE2-NEXT: por %xmm6, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pandn %xmm3, %xmm8 ; SSE2-NEXT: por %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] ; SSE2-NEXT: pand %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm3, %xmm7 -; SSE2-NEXT: por %xmm5, %xmm7 -; SSE2-NEXT: packssdw %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: packssdw %xmm8, %xmm6 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: por %xmm2, %xmm7 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: packssdw %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm0, %xmm7 -; SSE2-NEXT: packsswb %xmm7, %xmm7 -; SSE2-NEXT: movq %xmm7, (%rsi) +; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: packssdw %xmm7, %xmm2 +; SSE2-NEXT: packssdw %xmm2, %xmm6 +; SSE2-NEXT: packsswb %xmm6, %xmm6 +; SSE2-NEXT: movq %xmm6, (%rsi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: @@ -3873,112 +3916,115 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3] -; SSSE3-NEXT: pxor %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm1 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm1 ; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm5 ; SSSE3-NEXT: pandn %xmm4, %xmm2 ; SSSE3-NEXT: por %xmm5, %xmm2 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm5 +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pandn %xmm4, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSSE3-NEXT: pxor %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm7, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm6, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm3 ; SSSE3-NEXT: pandn %xmm4, %xmm7 ; SSSE3-NEXT: por %xmm3, %xmm7 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pxor %xmm0, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm8 +; SSSE3-NEXT: por %xmm6, %xmm8 ; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pandn %xmm3, %xmm8 ; SSSE3-NEXT: por %xmm7, %xmm8 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pxor %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] ; SSSE3-NEXT: pand %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: pandn %xmm3, %xmm7 -; SSSE3-NEXT: por %xmm5, %xmm7 -; SSSE3-NEXT: packssdw %xmm8, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pandn %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: packssdw %xmm8, %xmm6 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 ; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2] -; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm9, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm3, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm2 +; SSSE3-NEXT: pandn %xmm3, %xmm7 +; SSSE3-NEXT: por %xmm2, %xmm7 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: packssdw %xmm5, %xmm0 -; SSSE3-NEXT: packssdw %xmm0, %xmm7 -; SSSE3-NEXT: packsswb %xmm7, %xmm7 -; SSSE3-NEXT: movq %xmm7, (%rsi) +; SSSE3-NEXT: pand %xmm5, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn %xmm3, %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 +; SSSE3-NEXT: packssdw %xmm7, %xmm2 +; SSSE3-NEXT: packssdw %xmm2, %xmm6 +; SSSE3-NEXT: packsswb %xmm6, %xmm6 +; SSSE3-NEXT: movq %xmm6, (%rsi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: @@ -4138,7 +4184,8 @@ ; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovsqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -4165,11 +4212,11 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { ; SSE2-LABEL: trunc_ssat_v16i64_v16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 +; SSE2-NEXT: movdqa (%rdi), %xmm7 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm12 -; SSE2-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-NEXT: movdqa 80(%rdi), %xmm7 +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa 48(%rdi), %xmm10 +; SSE2-NEXT: movdqa 80(%rdi), %xmm8 ; SSE2-NEXT: movdqa 64(%rdi), %xmm5 ; SSE2-NEXT: movdqa 112(%rdi), %xmm4 ; SSE2-NEXT: movdqa 96(%rdi), %xmm3 @@ -4177,223 +4224,230 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm2 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm3 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm6, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm5, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm4 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm8 ; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm12, %xmm7 -; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm12 -; SSE2-NEXT: pandn %xmm6, %xmm7 -; SSE2-NEXT: por %xmm12, %xmm7 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm6, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: por %xmm8, %xmm5 +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: pandn %xmm6, %xmm8 +; SSE2-NEXT: por %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm9, %xmm14 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm15 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSE2-NEXT: por %xmm15, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pandn %xmm6, %xmm11 -; SSE2-NEXT: por %xmm8, %xmm11 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm13, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] -; SSE2-NEXT: por %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm7, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm14 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm14, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm7 ; SSE2-NEXT: pandn %xmm6, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: por %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm0, %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm13, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm12, %xmm9 +; SSE2-NEXT: pandn %xmm6, %xmm12 +; SSE2-NEXT: por %xmm9, %xmm12 ; SSE2-NEXT: movdqa %xmm10, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm13 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm13, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm14, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm10 -; SSE2-NEXT: pandn %xmm6, %xmm13 -; SSE2-NEXT: por %xmm10, %xmm13 -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] ; SSE2-NEXT: por %xmm14, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: por %xmm11, %xmm0 -; SSE2-NEXT: packssdw %xmm13, %xmm0 -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 +; SSE2-NEXT: por %xmm10, %xmm0 +; SSE2-NEXT: packssdw %xmm12, %xmm0 +; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm12 +; SSE2-NEXT: por %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm11 ; SSE2-NEXT: pandn %xmm6, %xmm10 -; SSE2-NEXT: por %xmm12, %xmm10 -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: por %xmm11, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm7 +; SSE2-NEXT: por %xmm9, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm8 ; SSE2-NEXT: pandn %xmm6, %xmm11 -; SSE2-NEXT: por %xmm7, %xmm11 +; SSE2-NEXT: por %xmm8, %xmm11 ; SSE2-NEXT: packssdw %xmm10, %xmm11 ; SSE2-NEXT: packssdw %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: pandn %xmm6, %xmm7 -; SSE2-NEXT: por %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm5, %xmm9 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm11, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm4 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: packssdw %xmm7, %xmm5 +; SSE2-NEXT: packssdw %xmm9, %xmm5 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm9, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm3 +; SSE2-NEXT: pandn %xmm6, %xmm8 +; SSE2-NEXT: por %xmm3, %xmm8 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm4, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: packssdw %xmm8, %xmm3 +; SSE2-NEXT: packssdw %xmm3, %xmm5 ; SSE2-NEXT: packsswb %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_ssat_v16i64_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm8 +; SSSE3-NEXT: movdqa (%rdi), %xmm7 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 -; SSSE3-NEXT: movdqa 32(%rdi), %xmm12 -; SSSE3-NEXT: movdqa 48(%rdi), %xmm11 -; SSSE3-NEXT: movdqa 80(%rdi), %xmm7 +; SSSE3-NEXT: movdqa 32(%rdi), %xmm11 +; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 +; SSSE3-NEXT: movdqa 80(%rdi), %xmm8 ; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 112(%rdi), %xmm4 ; SSSE3-NEXT: movdqa 96(%rdi), %xmm3 @@ -4401,213 +4455,220 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] -; SSSE3-NEXT: pxor %xmm10, %xmm10 -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm2 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm2 ; SSSE3-NEXT: pand %xmm2, %xmm3 ; SSSE3-NEXT: pandn %xmm6, %xmm2 ; SSSE3-NEXT: por %xmm3, %xmm2 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm3 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm4 ; SSSE3-NEXT: pandn %xmm6, %xmm3 ; SSSE3-NEXT: por %xmm4, %xmm3 ; SSSE3-NEXT: movdqa %xmm5, %xmm4 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm4 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm5 ; SSSE3-NEXT: pandn %xmm6, %xmm4 ; SSSE3-NEXT: por %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: movdqa %xmm8, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm8 ; SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSSE3-NEXT: por %xmm7, %xmm5 -; SSSE3-NEXT: movdqa %xmm12, %xmm7 -; SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm12 -; SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSSE3-NEXT: por %xmm12, %xmm7 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pxor %xmm1, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm11 -; SSSE3-NEXT: pandn %xmm6, %xmm12 -; SSSE3-NEXT: por %xmm11, %xmm12 -; SSSE3-NEXT: movdqa %xmm8, %xmm11 +; SSSE3-NEXT: por %xmm8, %xmm5 +; SSSE3-NEXT: movdqa %xmm11, %xmm8 +; SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm11 +; SSSE3-NEXT: pandn %xmm6, %xmm8 +; SSSE3-NEXT: por %xmm11, %xmm8 +; SSSE3-NEXT: movdqa %xmm10, %xmm11 ; SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm9, %xmm14 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm15 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm14[1,1,3,3] -; SSSE3-NEXT: por %xmm15, %xmm11 -; SSSE3-NEXT: pand %xmm11, %xmm8 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm11[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm10 ; SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSSE3-NEXT: por %xmm8, %xmm11 -; SSSE3-NEXT: movdqa %xmm0, %xmm8 -; SSSE3-NEXT: pxor %xmm1, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm9[0,0,2,2] -; SSSE3-NEXT: pand %xmm13, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[1,1,3,3] -; SSSE3-NEXT: por %xmm8, %xmm10 -; SSSE3-NEXT: pand %xmm10, %xmm0 +; SSSE3-NEXT: por %xmm10, %xmm11 +; SSSE3-NEXT: movdqa %xmm7, %xmm10 +; SSSE3-NEXT: pxor %xmm1, %xmm10 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm10[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm14 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm14, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm7 ; SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSSE3-NEXT: por %xmm0, %xmm10 +; SSSE3-NEXT: por %xmm7, %xmm10 +; SSSE3-NEXT: movdqa %xmm0, %xmm7 +; SSSE3-NEXT: pxor %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm9, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm7, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm0 +; SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSSE3-NEXT: por %xmm0, %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488] +; SSSE3-NEXT: movdqa %xmm9, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [18446744071562067840,18446744071562067840] +; SSSE3-NEXT: movdqa %xmm0, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pand %xmm13, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] +; SSSE3-NEXT: por %xmm0, %xmm12 +; SSSE3-NEXT: pand %xmm12, %xmm9 +; SSSE3-NEXT: pandn %xmm6, %xmm12 +; SSSE3-NEXT: por %xmm9, %xmm12 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm13 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm0, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm13, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm14, %xmm13 -; SSSE3-NEXT: pand %xmm13, %xmm10 -; SSSE3-NEXT: pandn %xmm6, %xmm13 -; SSSE3-NEXT: por %xmm10, %xmm13 -; SSSE3-NEXT: movdqa %xmm11, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] -; SSSE3-NEXT: pand %xmm10, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,3,3] ; SSSE3-NEXT: por %xmm14, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm11 +; SSSE3-NEXT: pand %xmm0, %xmm10 ; SSSE3-NEXT: pandn %xmm6, %xmm0 -; SSSE3-NEXT: por %xmm11, %xmm0 -; SSSE3-NEXT: packssdw %xmm13, %xmm0 -; SSSE3-NEXT: movdqa %xmm12, %xmm10 -; SSSE3-NEXT: pxor %xmm1, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,0,2,2] -; SSSE3-NEXT: pand %xmm11, %xmm13 +; SSSE3-NEXT: por %xmm10, %xmm0 +; SSSE3-NEXT: packssdw %xmm12, %xmm0 +; SSSE3-NEXT: movdqa %xmm11, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm10 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSSE3-NEXT: por %xmm13, %xmm10 -; SSSE3-NEXT: pand %xmm10, %xmm12 +; SSSE3-NEXT: por %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm10, %xmm11 ; SSSE3-NEXT: pandn %xmm6, %xmm10 -; SSSE3-NEXT: por %xmm12, %xmm10 -; SSSE3-NEXT: movdqa %xmm7, %xmm11 -; SSSE3-NEXT: pxor %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSSE3-NEXT: pand %xmm12, %xmm13 +; SSSE3-NEXT: por %xmm11, %xmm10 +; SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm9, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: pand %xmm12, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm13, %xmm11 -; SSSE3-NEXT: pand %xmm11, %xmm7 +; SSSE3-NEXT: por %xmm9, %xmm11 +; SSSE3-NEXT: pand %xmm11, %xmm8 ; SSSE3-NEXT: pandn %xmm6, %xmm11 -; SSSE3-NEXT: por %xmm7, %xmm11 +; SSSE3-NEXT: por %xmm8, %xmm11 ; SSSE3-NEXT: packssdw %xmm10, %xmm11 ; SSSE3-NEXT: packssdw %xmm11, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pxor %xmm1, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSSE3-NEXT: pand %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm5 -; SSSE3-NEXT: pandn %xmm6, %xmm7 -; SSSE3-NEXT: por %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm1, %xmm8 +; SSSE3-NEXT: movdqa %xmm8, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pandn %xmm6, %xmm9 +; SSSE3-NEXT: por %xmm5, %xmm9 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 ; SSSE3-NEXT: pxor %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,0,2,2] +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,3,3] ; SSSE3-NEXT: pand %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSSE3-NEXT: por %xmm11, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm4 ; SSSE3-NEXT: pandn %xmm6, %xmm5 ; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: packssdw %xmm7, %xmm5 +; SSSE3-NEXT: packssdw %xmm9, %xmm5 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: pxor %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSSE3-NEXT: pand %xmm7, %xmm10 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm10, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm6, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 +; SSSE3-NEXT: pand %xmm9, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm4, %xmm8 +; SSSE3-NEXT: pand %xmm8, %xmm3 +; SSSE3-NEXT: pandn %xmm6, %xmm8 +; SSSE3-NEXT: por %xmm3, %xmm8 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSSE3-NEXT: pand %xmm3, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm7, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm6, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm4, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm5 +; SSSE3-NEXT: pand %xmm4, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pandn %xmm6, %xmm3 +; SSSE3-NEXT: por %xmm2, %xmm3 +; SSSE3-NEXT: packssdw %xmm8, %xmm3 +; SSSE3-NEXT: packssdw %xmm3, %xmm5 ; SSSE3-NEXT: packsswb %xmm5, %xmm0 ; SSSE3-NEXT: retq ; @@ -4908,11 +4969,12 @@ ; SKX-NEXT: vmovdqa 96(%rdi), %ymm3 ; SKX-NEXT: vpmovsqb %ymm3, %xmm3 ; SKX-NEXT: vpmovsqb %ymm2, %xmm2 -; SKX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] +; SKX-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 ; SKX-NEXT: vpmovsqb %ymm1, %xmm1 ; SKX-NEXT: vpmovsqb %ymm0, %xmm0 ; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <16 x i64>, ptr %p0 @@ -5101,7 +5163,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8_store: @@ -5114,12 +5177,14 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v4i32_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdb %xmm0, (%rdi) +; SKX-NEXT: vpmovsdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -5231,7 +5296,8 @@ ; ; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -5246,13 +5312,15 @@ ; ; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v8i32_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsdb %ymm0, (%rdi) +; SKX-NEXT: vpmovsdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp slt <8 x i32> %a0, @@ -5418,32 +5486,16 @@ ; AVX-NEXT: vmovq %xmm0, (%rdi) ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rdi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v8i16_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc_ssat_v8i16_v8i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rdi) +; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_ssat_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovswb %xmm0, (%rdi) +; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp slt <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -24,11 +24,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -40,11 +41,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -58,10 +60,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] @@ -131,11 +134,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -148,11 +152,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -167,10 +172,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] @@ -212,7 +218,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i32_store: @@ -225,12 +232,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i32_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqd %xmm0, (%rdi) +; SKX-NEXT: vpmovusqd %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -245,27 +254,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i32: @@ -273,46 +282,47 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pxor %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSSE3-NEXT: por %xmm1, %xmm2 ; SSSE3-NEXT: pand %xmm3, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; SSE41-NEXT: pand %xmm5, %xmm0 ; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] ; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] @@ -442,52 +452,52 @@ ; SSE2-LABEL: trunc_usat_v8i64_v8i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pxor %xmm0, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 ; SSE2-NEXT: pandn %xmm3, %xmm7 ; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm6 ; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm3, %xmm0 @@ -498,52 +508,52 @@ ; SSSE3-LABEL: trunc_usat_v8i64_v8i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa (%rdi), %xmm2 -; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 +; SSSE3-NEXT: movdqa 16(%rdi), %xmm5 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pxor %xmm0, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 -; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm1 ; SSSE3-NEXT: pandn %xmm3, %xmm7 ; SSSE3-NEXT: por %xmm1, %xmm7 ; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pxor %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm6 ; SSSE3-NEXT: pandn %xmm3, %xmm1 ; SSSE3-NEXT: por %xmm6, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm0, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSSE3-NEXT: pand %xmm7, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm5 ; SSSE3-NEXT: pandn %xmm3, %xmm6 -; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] ; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSSE3-NEXT: pand %xmm5, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm3, %xmm0 @@ -553,54 +563,53 @@ ; ; SSE41-LABEL: trunc_usat_v8i64_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm4 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm3 +; SSE41-NEXT: movdqa 16(%rdi), %xmm5 +; SSE41-NEXT: movdqa 32(%rdi), %xmm7 ; SSE41-NEXT: movdqa 48(%rdi), %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] +; SSE41-NEXT: movdqa %xmm6, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[0,2] +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm6, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm5, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2] -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i32: @@ -714,11 +723,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -731,11 +741,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -750,10 +761,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] @@ -835,11 +847,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -853,11 +866,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -873,10 +887,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] @@ -933,7 +948,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i16_store: @@ -946,12 +962,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqw %xmm0, (%rdi) +; SKX-NEXT: vpmovusqw %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -967,27 +985,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -1000,27 +1018,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -1031,27 +1049,26 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 ; SSE41-NEXT: packusdw %xmm3, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq @@ -1134,27 +1151,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] @@ -1168,27 +1185,27 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] @@ -1199,30 +1216,29 @@ ; SSE41-LABEL: trunc_usat_v4i64_v4i16_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: packusdw %xmm7, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rdi) +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 +; SSE41-NEXT: pxor %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm6, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm3 +; SSE41-NEXT: movq %xmm3, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i16_store: @@ -1271,7 +1287,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1285,13 +1302,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i64_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqw %ymm0, (%rdi) +; SKX-NEXT: vpmovusqw %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <4 x i64> %a0, @@ -1304,7 +1323,7 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; SSE2-LABEL: trunc_usat_v8i64_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm4 ; SSE2-NEXT: movdqa 16(%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm6 ; SSE2-NEXT: movdqa 48(%rdi), %xmm7 @@ -1312,49 +1331,49 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm6 ; SSE2-NEXT: pxor %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm7 ; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: por %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pandn %xmm2, %xmm7 -; SSE2-NEXT: por %xmm5, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSE2-NEXT: por %xmm4, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] @@ -1369,7 +1388,7 @@ ; ; SSSE3-LABEL: trunc_usat_v8i64_v8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm5 +; SSSE3-NEXT: movdqa (%rdi), %xmm4 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm7 @@ -1377,49 +1396,49 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm6, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm9, %xmm1 +; SSSE3-NEXT: pand %xmm8, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm6 ; SSSE3-NEXT: pandn %xmm2, %xmm1 ; SSSE3-NEXT: por %xmm6, %xmm1 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 ; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm9, %xmm6 +; SSSE3-NEXT: pand %xmm8, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm7 ; SSSE3-NEXT: pandn %xmm2, %xmm6 ; SSSE3-NEXT: por %xmm7, %xmm6 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm9, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm7 +; SSSE3-NEXT: pand %xmm7, %xmm4 ; SSSE3-NEXT: pandn %xmm2, %xmm7 -; SSSE3-NEXT: por %xmm5, %xmm7 -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] +; SSSE3-NEXT: por %xmm4, %xmm7 +; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm0, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3] ; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] @@ -1434,55 +1453,54 @@ ; ; SSE41-LABEL: trunc_usat_v8i64_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147549183,2147549183,2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002324991,9223372039002324991] ; SSE41-NEXT: movdqa %xmm5, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i16: @@ -1697,7 +1715,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i16_store: @@ -1710,12 +1729,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i32_v4i16_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdw %xmm0, (%rdi) +; SKX-NEXT: vpmovusdw %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -1727,26 +1748,26 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { ; SSE2-LABEL: trunc_usat_v8i32_v8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_usat_v8i32_v8i16: @@ -1841,39 +1862,39 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(ptr %p0) { ; SSE2-LABEL: trunc_usat_v16i32_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm6 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa 32(%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm8 -; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: pandn %xmm5, %xmm3 ; SSE2-NEXT: por %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pxor %xmm6, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pandn %xmm5, %xmm0 +; SSE2-NEXT: por %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm5, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pslld $16, %xmm0 @@ -1888,39 +1909,39 @@ ; ; SSSE3-LABEL: trunc_usat_v16i32_v16i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa (%rdi), %xmm5 +; SSSE3-NEXT: movdqa (%rdi), %xmm6 ; SSSE3-NEXT: movdqa 16(%rdi), %xmm4 ; SSSE3-NEXT: movdqa 32(%rdi), %xmm0 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm8 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535] +; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] ; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 +; SSSE3-NEXT: pxor %xmm7, %xmm3 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 ; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: pandn %xmm5, %xmm1 ; SSSE3-NEXT: por %xmm0, %xmm1 ; SSSE3-NEXT: movdqa %xmm8, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 ; SSSE3-NEXT: pand %xmm3, %xmm8 -; SSSE3-NEXT: pxor %xmm7, %xmm3 +; SSSE3-NEXT: pandn %xmm5, %xmm3 ; SSSE3-NEXT: por %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm8 -; SSSE3-NEXT: pxor %xmm6, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pxor %xmm7, %xmm8 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm7, %xmm0 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm0, %xmm6 +; SSSE3-NEXT: pandn %xmm5, %xmm0 +; SSSE3-NEXT: por %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm4 +; SSSE3-NEXT: pandn %xmm5, %xmm2 +; SSSE3-NEXT: por %xmm4, %xmm2 ; SSSE3-NEXT: pslld $16, %xmm2 ; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: pslld $16, %xmm0 @@ -1995,11 +2016,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2014,11 +2036,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2032,10 +2055,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2104,11 +2128,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2125,11 +2150,12 @@ ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -2145,10 +2171,11 @@ ; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] ; SSE41-NEXT: pand %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2190,7 +2217,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v2i64_v2i8_store: @@ -2203,12 +2231,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v2i64_v2i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqb %xmm0, (%rdi) +; SKX-NEXT: vpmovusqb %xmm0, %xmm0 +; SKX-NEXT: vpextrw $0, %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <2 x i64> %a0, %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> @@ -2224,29 +2254,29 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq @@ -2257,60 +2287,59 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm4 ; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm7, %xmm4 +; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm1, %xmm3 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm5 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm7, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: pxor %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm6, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm3, %xmm0 +; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm0, %xmm5 ; SSE41-NEXT: pshufb %xmm0, %xmm4 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8: @@ -2395,29 +2424,29 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm4, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: movd %xmm3, (%rdi) @@ -2425,65 +2454,64 @@ ; ; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pandn %xmm3, %xmm2 -; SSSE3-NEXT: por %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm3, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: movd %xmm2, (%rdi) +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm4, %xmm0 +; SSSE3-NEXT: pandn %xmm2, %xmm4 +; SSSE3-NEXT: por %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm5, %xmm0 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: pshufb %xmm0, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSSE3-NEXT: movd %xmm4, (%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v4i64_v4i8_store: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm7 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; SSE41-NEXT: pxor %xmm1, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm6, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm0, %xmm4 -; SSE41-NEXT: pshufb %xmm0, %xmm7 -; SSE41-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE41-NEXT: movd %xmm7, (%rdi) +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: pshufb %xmm0, %xmm6 +; SSE41-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; SSE41-NEXT: movd %xmm6, (%rdi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v4i64_v4i8_store: @@ -2536,7 +2564,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -2550,13 +2579,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i64_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusqb %ymm0, (%rdi) +; SKX-NEXT: vpmovusqb %ymm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <4 x i64> %a0, @@ -2577,51 +2608,51 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm7 ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: movdqa %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pand %xmm8, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm6, %xmm0 ; SSE2-NEXT: packuswb %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pxor %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm6 ; SSE2-NEXT: por %xmm5, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: packuswb %xmm6, %xmm5 -; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm5, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: retq ; @@ -2635,106 +2666,105 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm7 ; SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm7 ; SSSE3-NEXT: por %xmm0, %xmm7 ; SSSE3-NEXT: movdqa %xmm6, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm4, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm4, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm9, %xmm0 +; SSSE3-NEXT: pand %xmm8, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm6 ; SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm6, %xmm0 ; SSSE3-NEXT: packuswb %xmm7, %xmm0 ; SSSE3-NEXT: movdqa %xmm5, %xmm6 ; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm4, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm5 ; SSSE3-NEXT: pandn %xmm2, %xmm6 ; SSSE3-NEXT: por %xmm5, %xmm6 -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: packuswb %xmm6, %xmm5 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSSE3-NEXT: pand %xmm5, %xmm3 +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: pandn %xmm2, %xmm3 +; SSSE3-NEXT: por %xmm1, %xmm3 +; SSSE3-NEXT: packuswb %xmm6, %xmm3 +; SSSE3-NEXT: packuswb %xmm3, %xmm0 ; SSSE3-NEXT: packuswb %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm4 -; SSE41-NEXT: movdqa 48(%rdi), %xmm7 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm3 +; SSE41-NEXT: movdqa 48(%rdi), %xmm6 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE41-NEXT: movdqa %xmm5, %xmm8 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 ; SSE41-NEXT: movdqa %xmm5, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm9, %xmm2 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm8 ; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm5, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm8, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm5, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 ; SSE41-NEXT: movdqa %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 -; SSE41-NEXT: pxor %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm5, %xmm1 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm2, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm5, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm7, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm1 +; SSE41-NEXT: packuswb %xmm1, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8: @@ -2819,51 +2849,51 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: pxor %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm7 -; SSE2-NEXT: pand %xmm9, %xmm7 +; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: por %xmm5, %xmm7 ; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] -; SSE2-NEXT: movdqa %xmm3, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm9, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pandn %xmm1, %xmm5 ; SSE2-NEXT: por %xmm6, %xmm5 ; SSE2-NEXT: packuswb %xmm7, %xmm5 ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm7, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm4 ; SSE2-NEXT: pandn %xmm1, %xmm6 ; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm5 ; SSE2-NEXT: packuswb %xmm5, %xmm5 ; SSE2-NEXT: movq %xmm5, (%rsi) ; SSE2-NEXT: retq @@ -2878,107 +2908,106 @@ ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm5, %xmm7 ; SSSE3-NEXT: pxor %xmm2, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm3, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7 -; SSSE3-NEXT: pand %xmm9, %xmm7 +; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm5 ; SSSE3-NEXT: pandn %xmm1, %xmm7 ; SSSE3-NEXT: por %xmm5, %xmm7 ; SSSE3-NEXT: movdqa %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm3, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5 ; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm9, %xmm5 +; SSSE3-NEXT: pand %xmm8, %xmm5 ; SSSE3-NEXT: pand %xmm5, %xmm6 ; SSSE3-NEXT: pandn %xmm1, %xmm5 ; SSSE3-NEXT: por %xmm6, %xmm5 ; SSSE3-NEXT: packuswb %xmm7, %xmm5 ; SSSE3-NEXT: movdqa %xmm4, %xmm6 ; SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: movdqa %xmm3, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm7, %xmm6 ; SSSE3-NEXT: pand %xmm6, %xmm4 ; SSSE3-NEXT: pandn %xmm1, %xmm6 ; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm1, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: packuswb %xmm6, %xmm4 -; SSSE3-NEXT: packuswb %xmm4, %xmm5 +; SSSE3-NEXT: pxor %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSSE3-NEXT: pand %xmm4, %xmm2 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pandn %xmm1, %xmm2 +; SSSE3-NEXT: por %xmm0, %xmm2 +; SSSE3-NEXT: packuswb %xmm6, %xmm2 +; SSSE3-NEXT: packuswb %xmm2, %xmm5 ; SSSE3-NEXT: packuswb %xmm5, %xmm5 ; SSSE3-NEXT: movq %xmm5, (%rsi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm8 -; SSE41-NEXT: movdqa 16(%rdi), %xmm7 -; SSE41-NEXT: movdqa 32(%rdi), %xmm3 -; SSE41-NEXT: movdqa 48(%rdi), %xmm6 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm9 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: packusdw %xmm9, %xmm7 +; SSE41-NEXT: movdqa (%rdi), %xmm7 +; SSE41-NEXT: movdqa 16(%rdi), %xmm6 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm5 +; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: movdqa %xmm4, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm8 +; SSE41-NEXT: movapd %xmm1, %xmm8 ; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: packusdw %xmm8, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm7 -; SSE41-NEXT: packuswb %xmm7, %xmm7 -; SSE41-NEXT: movq %xmm7, (%rsi) +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm6 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; SSE41-NEXT: packusdw %xmm8, %xmm6 +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm1, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] +; SSE41-NEXT: pand %xmm5, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: packusdw %xmm7, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm6 +; SSE41-NEXT: packuswb %xmm6, %xmm6 +; SSE41-NEXT: movq %xmm6, (%rsi) ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: @@ -3037,14 +3066,16 @@ ; AVX512-LABEL: trunc_usat_v8i64_v8i8_store: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i64_v8i8_store: ; SKX: # %bb.0: ; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: vpmovusqb %zmm0, (%rsi) +; SKX-NEXT: vpmovusqb %zmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rsi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %a0 = load <8 x i64>, ptr %p0 @@ -3059,7 +3090,7 @@ ; SSE2-LABEL: trunc_usat_v16i64_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa 96(%rdi), %xmm1 -; SSE2-NEXT: movdqa 112(%rdi), %xmm4 +; SSE2-NEXT: movdqa 112(%rdi), %xmm3 ; SSE2-NEXT: movdqa 64(%rdi), %xmm6 ; SSE2-NEXT: movdqa 80(%rdi), %xmm7 ; SSE2-NEXT: movdqa (%rdi), %xmm10 @@ -3067,96 +3098,96 @@ ; SSE2-NEXT: movdqa 32(%rdi), %xmm8 ; SSE2-NEXT: movdqa 48(%rdi), %xmm9 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm3, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm5, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2-NEXT: pxor %xmm4, %xmm11 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm11 -; SSE2-NEXT: pand %xmm13, %xmm11 +; SSE2-NEXT: pand %xmm12, %xmm11 ; SSE2-NEXT: pand %xmm11, %xmm0 ; SSE2-NEXT: pandn %xmm2, %xmm11 ; SSE2-NEXT: por %xmm0, %xmm11 ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm13 -; SSE2-NEXT: pcmpgtd %xmm12, %xmm13 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm12 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm13, %xmm0 +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm10 ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: packuswb %xmm11, %xmm0 ; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: pxor %xmm3, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm4, %xmm10 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm9 ; SSE2-NEXT: pandn %xmm2, %xmm10 ; SSE2-NEXT: por %xmm9, %xmm10 ; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pxor %xmm3, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm9 -; SSE2-NEXT: pand %xmm12, %xmm9 +; SSE2-NEXT: pand %xmm11, %xmm9 ; SSE2-NEXT: pand %xmm9, %xmm8 ; SSE2-NEXT: pandn %xmm2, %xmm9 ; SSE2-NEXT: por %xmm8, %xmm9 ; SSE2-NEXT: packuswb %xmm10, %xmm9 ; SSE2-NEXT: packuswb %xmm9, %xmm0 ; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pxor %xmm3, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm4, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm8 -; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pand %xmm9, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pandn %xmm2, %xmm8 ; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pxor %xmm4, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm10, %xmm7 +; SSE2-NEXT: pand %xmm9, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm6 ; SSE2-NEXT: pandn %xmm2, %xmm7 ; SSE2-NEXT: por %xmm6, %xmm7 ; SSE2-NEXT: packuswb %xmm8, %xmm7 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm2, %xmm4 ; SSE2-NEXT: por %xmm1, %xmm4 @@ -3168,7 +3199,7 @@ ; SSSE3-LABEL: trunc_usat_v16i64_v16i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa 96(%rdi), %xmm1 -; SSSE3-NEXT: movdqa 112(%rdi), %xmm4 +; SSSE3-NEXT: movdqa 112(%rdi), %xmm3 ; SSSE3-NEXT: movdqa 64(%rdi), %xmm6 ; SSSE3-NEXT: movdqa 80(%rdi), %xmm7 ; SSSE3-NEXT: movdqa (%rdi), %xmm10 @@ -3176,96 +3207,96 @@ ; SSSE3-NEXT: movdqa 32(%rdi), %xmm8 ; SSSE3-NEXT: movdqa 48(%rdi), %xmm9 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] ; SSSE3-NEXT: movdqa %xmm0, %xmm11 -; SSSE3-NEXT: pxor %xmm3, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm5, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 +; SSSE3-NEXT: pxor %xmm4, %xmm11 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259711,9223372039002259711] +; SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm11 ; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm11 -; SSSE3-NEXT: pand %xmm13, %xmm11 +; SSSE3-NEXT: pand %xmm12, %xmm11 ; SSSE3-NEXT: pand %xmm11, %xmm0 ; SSSE3-NEXT: pandn %xmm2, %xmm11 ; SSSE3-NEXT: por %xmm0, %xmm11 ; SSSE3-NEXT: movdqa %xmm10, %xmm0 -; SSSE3-NEXT: pxor %xmm3, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm13 -; SSSE3-NEXT: pcmpgtd %xmm12, %xmm13 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: movdqa %xmm5, %xmm12 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm12 +; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm13, %xmm0 +; SSSE3-NEXT: pand %xmm12, %xmm0 ; SSSE3-NEXT: pand %xmm0, %xmm10 ; SSSE3-NEXT: pandn %xmm2, %xmm0 ; SSSE3-NEXT: por %xmm10, %xmm0 ; SSSE3-NEXT: packuswb %xmm11, %xmm0 ; SSSE3-NEXT: movdqa %xmm9, %xmm10 -; SSSE3-NEXT: pxor %xmm3, %xmm10 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pxor %xmm4, %xmm10 +; SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm10 ; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm10 -; SSSE3-NEXT: pand %xmm12, %xmm10 +; SSSE3-NEXT: pand %xmm11, %xmm10 ; SSSE3-NEXT: pand %xmm10, %xmm9 ; SSSE3-NEXT: pandn %xmm2, %xmm10 ; SSSE3-NEXT: por %xmm9, %xmm10 ; SSSE3-NEXT: movdqa %xmm8, %xmm9 -; SSSE3-NEXT: pxor %xmm3, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 +; SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSSE3-NEXT: movdqa %xmm5, %xmm11 +; SSSE3-NEXT: pcmpgtd %xmm9, %xmm11 +; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm9 ; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm9 -; SSSE3-NEXT: pand %xmm12, %xmm9 +; SSSE3-NEXT: pand %xmm11, %xmm9 ; SSSE3-NEXT: pand %xmm9, %xmm8 ; SSSE3-NEXT: pandn %xmm2, %xmm9 ; SSSE3-NEXT: por %xmm8, %xmm9 ; SSSE3-NEXT: packuswb %xmm10, %xmm9 ; SSSE3-NEXT: packuswb %xmm9, %xmm0 ; SSSE3-NEXT: movdqa %xmm7, %xmm8 -; SSSE3-NEXT: pxor %xmm3, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm4, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm8 -; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pand %xmm9, %xmm8 ; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: pandn %xmm2, %xmm8 ; SSSE3-NEXT: por %xmm7, %xmm8 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: pxor %xmm3, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm10 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm10 +; SSSE3-NEXT: pxor %xmm4, %xmm7 +; SSSE3-NEXT: movdqa %xmm5, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7 ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm10, %xmm7 +; SSSE3-NEXT: pand %xmm9, %xmm7 ; SSSE3-NEXT: pand %xmm7, %xmm6 ; SSSE3-NEXT: pandn %xmm2, %xmm7 ; SSSE3-NEXT: por %xmm6, %xmm7 ; SSSE3-NEXT: packuswb %xmm8, %xmm7 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] -; SSSE3-NEXT: movdqa %xmm5, %xmm9 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: movdqa %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm4, %xmm6 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6 -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm4 +; SSSE3-NEXT: pand %xmm8, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 ; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: pxor %xmm1, %xmm4 +; SSSE3-NEXT: movdqa %xmm5, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm5, %xmm4 ; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pand %xmm3, %xmm4 ; SSSE3-NEXT: pand %xmm4, %xmm1 ; SSSE3-NEXT: pandn %xmm2, %xmm4 ; SSSE3-NEXT: por %xmm1, %xmm4 @@ -3276,103 +3307,102 @@ ; ; SSE41-LABEL: trunc_usat_v16i64_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa 96(%rdi), %xmm3 -; SSE41-NEXT: movdqa 112(%rdi), %xmm5 -; SSE41-NEXT: movdqa 64(%rdi), %xmm8 -; SSE41-NEXT: movdqa 80(%rdi), %xmm9 -; SSE41-NEXT: movdqa (%rdi), %xmm12 -; SSE41-NEXT: movdqa 16(%rdi), %xmm2 -; SSE41-NEXT: movdqa 32(%rdi), %xmm10 -; SSE41-NEXT: movdqa 48(%rdi), %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm6, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483903,2147483903,2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm0[0,0,2,2] +; SSE41-NEXT: movdqa 96(%rdi), %xmm2 +; SSE41-NEXT: movdqa 112(%rdi), %xmm4 +; SSE41-NEXT: movdqa 64(%rdi), %xmm7 +; SSE41-NEXT: movdqa 80(%rdi), %xmm8 +; SSE41-NEXT: movdqa (%rdi), %xmm11 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm9 +; SSE41-NEXT: movdqa 48(%rdi), %xmm10 +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [255,255] +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm14, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm2 -; SSE41-NEXT: packusdw %xmm13, %xmm2 -; SSE41-NEXT: movdqa %xmm11, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259711,9223372039002259711] ; SSE41-NEXT: movdqa %xmm6, %xmm12 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] ; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm12 +; SSE41-NEXT: movapd %xmm3, %xmm12 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 +; SSE41-NEXT: movdqa %xmm11, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm6, %xmm13 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 +; SSE41-NEXT: packusdw %xmm12, %xmm1 ; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm11 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm13, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm11 +; SSE41-NEXT: movapd %xmm3, %xmm11 ; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm11 -; SSE41-NEXT: packusdw %xmm12, %xmm11 -; SSE41-NEXT: packusdw %xmm11, %xmm2 ; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm10 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm12 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] ; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm10 +; SSE41-NEXT: movapd %xmm3, %xmm10 ; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm10 +; SSE41-NEXT: packusdw %xmm11, %xmm10 +; SSE41-NEXT: packusdw %xmm10, %xmm1 ; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm9 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm9 +; SSE41-NEXT: movapd %xmm3, %xmm9 ; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 -; SSE41-NEXT: packusdw %xmm10, %xmm9 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 +; SSE41-NEXT: movdqa %xmm7, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 ; SSE41-NEXT: movdqa %xmm6, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm10 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] ; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 -; SSE41-NEXT: pxor %xmm3, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm1 +; SSE41-NEXT: movapd %xmm3, %xmm8 +; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm8 +; SSE41-NEXT: packusdw %xmm9, %xmm8 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] +; SSE41-NEXT: pand %xmm7, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm7 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm7 +; SSE41-NEXT: pxor %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: packusdw %xmm7, %xmm3 +; SSE41-NEXT: packusdw %xmm3, %xmm8 +; SSE41-NEXT: packuswb %xmm8, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm8, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm9 -; SSE41-NEXT: packuswb %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_usat_v16i64_v16i8: @@ -3637,7 +3667,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512VL-NEXT: vmovd %xmm0, (%rdi) ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: trunc_usat_v4i32_v4i8_store: @@ -3650,12 +3681,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovd %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v4i32_v4i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdb %xmm0, (%rdi) +; SKX-NEXT: vpmovusdb %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <4 x i32> %a0, %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> @@ -3862,7 +3895,8 @@ ; ; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -3876,13 +3910,15 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i32_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovusdb %ymm0, (%rdi) +; SKX-NEXT: vpmovusdb %ymm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %1 = icmp ult <8 x i32> %a0, @@ -4299,12 +4335,14 @@ ; ; AVX512BWVL-LABEL: trunc_usat_v8i16_v8i8_store: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) +; AVX512BWVL-NEXT: vpmovuswb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rdi) ; AVX512BWVL-NEXT: retq ; ; SKX-LABEL: trunc_usat_v8i16_v8i8_store: ; SKX: # %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %1 = icmp ult <8 x i16> %a0, %2 = select <8 x i1> %1, <8 x i16> %a0, <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -84,7 +84,8 @@ ; ; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr: ; AVX2-FAST-ALL: # %bb.0: # %entry -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -132,7 +133,8 @@ ; ; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr: ; AVX2-FAST-ALL: # %bb.0: # %entry -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1949,11 +1951,22 @@ } define <8 x i16> @PR32160(<8 x i32> %x) { -; SSE-LABEL: PR32160: -; SSE: # %bb.0: -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: retq +; SSE2-LABEL: PR32160: +; SSE2: # %bb.0: +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR32160: +; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR32160: +; SSE41: # %bb.0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] +; SSE41-NEXT: retq ; ; AVX-LABEL: PR32160: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -16,27 +16,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -141,27 +142,28 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -266,34 +268,35 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -415,34 +418,35 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -562,29 +566,30 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -710,29 +715,30 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 ; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 @@ -860,24 +866,25 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1002,24 +1009,25 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -264,30 +264,31 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm1, %ymm4, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -334,19 +335,20 @@ ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16: @@ -363,30 +365,31 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm1, %ymm4, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsllw $8, %ymm2, %ymm4 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512CD-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm4 +; AVX512CD-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -433,19 +436,20 @@ ; ; AVX512VPOPCNTDQ-LABEL: testv32i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm2, %zmm2 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16u: @@ -464,25 +468,26 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -522,25 +527,26 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -560,25 +566,26 @@ ; AVX512CD: # %bb.0: ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; @@ -618,25 +625,26 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -590,13 +590,13 @@ ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: movdqa %xmm2, (%rdi) -; SSE-NEXT: movdqa %xmm1, 16(%rdi) +; SSE-NEXT: movdqa %xmm2, 16(%rdi) +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movdqa %xmm1, 16(%rsi) +; SSE-NEXT: movdqa %xmm0, 16(%rsi) +; SSE-NEXT: movdqa %xmm1, (%rsi) ; SSE-NEXT: .LBB18_2: # %if.end ; SSE-NEXT: retq ; @@ -609,12 +609,12 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa %xmm2, (%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi) +; AVX1-NEXT: vmovdqa %xmm1, (%rsi) ; AVX1-NEXT: .LBB18_2: # %if.end ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2332,11 +2332,8 @@ ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $13, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: shldq $13, %rax, %rcx +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shrq $34, %rax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2353,11 +2350,8 @@ ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $13, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: shldq $13, %rax, %rcx +; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: shrq $34, %rax ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -2367,15 +2361,12 @@ ; ; SSE41-LABEL: zext_4i17_to_4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movl 8(%rdi), %eax -; SSE41-NEXT: shll $13, %eax -; SSE41-NEXT: movq (%rdi), %rcx -; SSE41-NEXT: movq %rcx, %rdx -; SSE41-NEXT: shrq $51, %rdx -; SSE41-NEXT: orl %eax, %edx -; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: movl 8(%rdi), %edx +; SSE41-NEXT: shldq $13, %rax, %rdx ; SSE41-NEXT: shrq $17, %rax -; SSE41-NEXT: movd %ecx, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: shrq $34, %rcx ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 @@ -2385,15 +2376,12 @@ ; ; AVX1-LABEL: zext_4i17_to_4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: movl 8(%rdi), %eax -; AVX1-NEXT: shll $13, %eax -; AVX1-NEXT: movq (%rdi), %rcx -; AVX1-NEXT: movq %rcx, %rdx -; AVX1-NEXT: shrq $51, %rdx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: movq %rcx, %rax +; AVX1-NEXT: movq (%rdi), %rax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movl 8(%rdi), %edx +; AVX1-NEXT: shldq $13, %rax, %rdx ; AVX1-NEXT: shrq $17, %rax -; AVX1-NEXT: vmovd %ecx, %xmm0 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: shrq $34, %rcx ; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2403,15 +2391,12 @@ ; ; AVX2-LABEL: zext_4i17_to_4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: movl 8(%rdi), %eax -; AVX2-NEXT: shll $13, %eax -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: shrq $51, %rdx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: movq %rcx, %rax +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movl 8(%rdi), %edx +; AVX2-NEXT: shldq $13, %rax, %rdx ; AVX2-NEXT: shrq $17, %rax -; AVX2-NEXT: vmovd %ecx, %xmm0 ; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: shrq $34, %rcx ; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2422,15 +2407,12 @@ ; ; AVX512-LABEL: zext_4i17_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: movl 8(%rdi), %eax -; AVX512-NEXT: shll $13, %eax -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq %rcx, %rdx -; AVX512-NEXT: shrq $51, %rdx -; AVX512-NEXT: orl %eax, %edx -; AVX512-NEXT: movq %rcx, %rax +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: movl 8(%rdi), %edx +; AVX512-NEXT: shldq $13, %rax, %rdx ; AVX512-NEXT: shrq $17, %rax -; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX512-NEXT: shrq $34, %rcx ; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 @@ -2555,25 +2537,31 @@ define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) { ; SSE2-LABEL: splatshuf_zext_v4i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v4i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v4i64: ; SSE41: # %bb.0: +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v4i64: @@ -2645,25 +2633,30 @@ define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) { ; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,2,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,7,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[u,u],zero,zero,xmm1[6,7],zero,zero,xmm1[14,15],zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[6,7],zero,zero,xmm1[6,7],zero,zero,xmm1[14,15],zero,zero ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef: diff --git a/llvm/test/CodeGen/X86/viabs.ll b/llvm/test/CodeGen/X86/viabs.ll --- a/llvm/test/CodeGen/X86/viabs.ll +++ b/llvm/test/CodeGen/X86/viabs.ll @@ -174,35 +174,50 @@ define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_v4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm0 ; SSE2-NEXT: psubd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubd %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x18,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %xmm0, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <4 x i32> zeroinitializer, %a %b = icmp sle <4 x i32> %a, zeroinitializer @@ -411,44 +426,68 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psubd %xmm2, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: psubd %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 -; SSSE3-NEXT: pabsd %xmm1, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 +; SSSE3-NEXT: pxor %xmm2, %xmm0 +; SSSE3-NEXT: psubd %xmm2, %xmm0 +; SSSE3-NEXT: pxor %xmm3, %xmm1 +; SSSE3-NEXT: psubd %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 -; SSE41-NEXT: pabsd %xmm1, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: psubd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: psubd %xmm3, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpabsd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1 # encoding: [0x62,0xf3,0x7d,0x38,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <8 x i32> zeroinitializer, %a %b = icmp sle <8 x i32> %a, zeroinitializer @@ -459,61 +498,103 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind { ; SSE2-LABEL: test_abs_le_16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: psubd %xmm7, %xmm1 +; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: psubd %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_16i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pabsd %xmm0, %xmm0 -; SSSE3-NEXT: pabsd %xmm1, %xmm1 -; SSSE3-NEXT: pabsd %xmm2, %xmm2 -; SSSE3-NEXT: pabsd %xmm3, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm4, %xmm6 +; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 +; SSSE3-NEXT: movdqa %xmm4, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm4, %xmm0 +; SSSE3-NEXT: psubd %xmm4, %xmm0 +; SSSE3-NEXT: pxor %xmm7, %xmm1 +; SSSE3-NEXT: psubd %xmm7, %xmm1 +; SSSE3-NEXT: pxor %xmm6, %xmm2 +; SSSE3-NEXT: psubd %xmm6, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm3 +; SSSE3-NEXT: psubd %xmm5, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_16i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pabsd %xmm0, %xmm0 -; SSE41-NEXT: pabsd %xmm1, %xmm1 -; SSE41-NEXT: pabsd %xmm2, %xmm2 -; SSE41-NEXT: pabsd %xmm3, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm6 +; SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: psubd %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm7, %xmm1 +; SSE41-NEXT: psubd %xmm7, %xmm1 +; SSE41-NEXT: pxor %xmm6, %xmm2 +; SSE41-NEXT: psubd %xmm6, %xmm2 +; SSE41-NEXT: pxor %xmm5, %xmm3 +; SSE41-NEXT: psubd %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_16i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpabsd %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpabsd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpabsd %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubd %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1] +; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtd %xmm1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtd %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvps %ymm5, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvps %ymm2, %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpabsd %ymm0, %ymm0 -; AVX2-NEXT: vpabsd %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpcmpgtd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_16i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsd %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0xfa,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <16 x i32> zeroinitializer, %a %b = icmp sle <16 x i32> %a, zeroinitializer @@ -637,92 +718,203 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind { ; SSE2-LABEL: test_abs_le_v8i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: psubq %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: psubq %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test_abs_le_v8i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: psubq %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: psubq %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: psubq %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm0, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: psubq %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: psubq %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: psubq %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: psubq %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: psubq %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpsubq %xmm5, %xmm3, %xmm6 +; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1,1] +; AVX1-NEXT: # xmm6 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm6, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vblendvpd %ymm5, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsq %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubq %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xf5,0x49,0xfb,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %tmp1neg = sub <8 x i64> zeroinitializer, %a %b = icmp sle <8 x i64> %a, zeroinitializer @@ -737,20 +929,53 @@ ; SSE2-NEXT: movdqu 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu 32(%rdi), %xmm2 ; SSE2-NEXT: movdqu 48(%rdi), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: psubq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psubq %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: psubq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm5, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm10, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: por %xmm8, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm10, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: pxor %xmm6, %xmm0 +; SSE2-NEXT: psubq %xmm6, %xmm0 +; SSE2-NEXT: pxor %xmm9, %xmm1 +; SSE2-NEXT: psubq %xmm9, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm2 +; SSE2-NEXT: psubq %xmm7, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm4, %xmm3 ; SSE2-NEXT: retq @@ -761,81 +986,161 @@ ; SSSE3-NEXT: movdqu 16(%rdi), %xmm1 ; SSSE3-NEXT: movdqu 32(%rdi), %xmm2 ; SSSE3-NEXT: movdqu 48(%rdi), %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: psubq %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: psubq %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: psubq %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm5, %xmm4 +; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSSE3-NEXT: movdqa %xmm6, %xmm7 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm4 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSSE3-NEXT: pand %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSSE3-NEXT: por %xmm9, %xmm4 +; SSSE3-NEXT: movdqa %xmm2, %xmm7 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm7 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSSE3-NEXT: pand %xmm9, %xmm10 +; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm10, %xmm7 +; SSSE3-NEXT: movdqa %xmm1, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: movdqa %xmm6, %xmm9 +; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSSE3-NEXT: por %xmm8, %xmm9 +; SSSE3-NEXT: pxor %xmm0, %xmm5 +; SSSE3-NEXT: movdqa %xmm6, %xmm8 +; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8 +; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSSE3-NEXT: pand %xmm10, %xmm5 +; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: pxor %xmm6, %xmm0 +; SSSE3-NEXT: psubq %xmm6, %xmm0 +; SSSE3-NEXT: pxor %xmm9, %xmm1 +; SSSE3-NEXT: psubq %xmm9, %xmm1 +; SSSE3-NEXT: pxor %xmm7, %xmm2 +; SSSE3-NEXT: psubq %xmm7, %xmm2 ; SSSE3-NEXT: pxor %xmm4, %xmm3 ; SSSE3-NEXT: psubq %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test_abs_le_v8i64_fold: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqu (%rdi), %xmm1 -; SSE41-NEXT: movdqu 16(%rdi), %xmm2 -; SSE41-NEXT: movdqu 32(%rdi), %xmm3 -; SSE41-NEXT: movdqu 48(%rdi), %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: psubq %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: psubq %xmm4, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm4, %xmm3 +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; SSE41-NEXT: movdqu 32(%rdi), %xmm2 +; SSE41-NEXT: movdqu 48(%rdi), %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pxor %xmm5, %xmm4 +; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483649,2147483649] +; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: pcmpgtd %xmm4, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE41-NEXT: por %xmm9, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: pxor %xmm5, %xmm7 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm7, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE41-NEXT: pand %xmm9, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm10, %xmm7 +; SSE41-NEXT: movdqa %xmm1, %xmm8 +; SSE41-NEXT: pxor %xmm5, %xmm8 +; SSE41-NEXT: movdqa %xmm6, %xmm9 +; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE41-NEXT: por %xmm8, %xmm9 +; SSE41-NEXT: pxor %xmm0, %xmm5 +; SSE41-NEXT: movdqa %xmm6, %xmm8 +; SSE41-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] +; SSE41-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3] +; SSE41-NEXT: por %xmm5, %xmm6 +; SSE41-NEXT: pxor %xmm6, %xmm0 +; SSE41-NEXT: psubq %xmm6, %xmm0 +; SSE41-NEXT: pxor %xmm9, %xmm1 +; SSE41-NEXT: psubq %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm7, %xmm2 +; SSE41-NEXT: psubq %xmm7, %xmm2 +; SSE41-NEXT: pxor %xmm4, %xmm3 +; SSE41-NEXT: psubq %xmm4, %xmm3 ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_abs_le_v8i64_fold: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rdi), %ymm0 -; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq 16(%rdi), %xmm2, %xmm3 -; AVX1-NEXT: vpsubq (%rdi), %xmm2, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vpsubq 48(%rdi), %xmm2, %xmm3 -; AVX1-NEXT: vpsubq 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovupd 32(%rdi), %ymm0 +; AVX1-NEXT: vmovupd (%rdi), %ymm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqu (%rdi), %xmm5 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm6 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm7 +; AVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm8 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 +; AVX1-NEXT: vpsubq %xmm6, %xmm1, %xmm8 +; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8 +; AVX1-NEXT: vmovddup {{.*#+}} xmm9 = [1,1] +; AVX1-NEXT: # xmm9 = mem[0,0] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm9, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm9, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm4, %ymm0, %ymm1 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm9, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_abs_le_v8i64_fold: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_abs_le_v8i64_fold: ; AVX512: # %bb.0: -; AVX512-NEXT: vpabsq (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0x07] +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07] +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9] +; AVX512-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1 # encoding: [0x62,0xf3,0xfd,0x58,0x1f,0x0d,A,A,A,A,0x01] +; AVX512-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-5, kind: reloc_riprel_4byte +; AVX512-NEXT: vpsubq %zmm0, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xf5,0x49,0xfb,0xc0] ; AVX512-NEXT: retq # encoding: [0xc3] %a = load <8 x i64>, ptr %a.ptr, align 8 %tmp1neg = sub <8 x i64> zeroinitializer, %a diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -95,14 +95,22 @@ define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { ; AVX1-LABEL: test3: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX1-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; AVX1-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rdi) ; AVX1-NEXT: vmovq %xmm1, (%rsi) @@ -110,17 +118,22 @@ ; ; AVX2-LABEL: test3: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX2-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0 ; AVX2-NEXT: vmovq %xmm0, (%rdi) ; AVX2-NEXT: vmovq %xmm1, (%rsi) @@ -128,9 +141,18 @@ ; ; AVX512-LABEL: test3: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vpcmpleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766] +; AVX512-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpmuldq %xmm4, %xmm0, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] +; AVX512-NEXT: vpsrld $31, %xmm3, %xmm4 +; AVX512-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpaddd %xmm3, %xmm3, %xmm4 +; AVX512-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %ymm0, %xmm0 @@ -290,17 +312,18 @@ ; AVX1-NEXT: vmovups (%rax), %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1] ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2] -; AVX1-NEXT: vmovups 16, %xmm2 -; AVX1-NEXT: vmovups 32, %xmm3 -; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1] -; AVX1-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] -; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; AVX1-NEXT: vmovups 0, %xmm2 +; AVX1-NEXT: vmovups 16, %xmm3 +; AVX1-NEXT: vmovups 32, %xmm4 +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm4[1],xmm2[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2],xmm5[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2] ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vcmpneqps %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm3, %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovups %xmm0, (%rax) ; AVX1-NEXT: vmovups %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -274,11 +274,18 @@ ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: signbit_mask_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_mask_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_mask_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> %b, <8 x i16> zeroinitializer ret <8 x i16> %r @@ -440,11 +447,18 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: signbit_mask_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_mask_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_mask_v16i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512DQBW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <16 x i16> %a, zeroinitializer %r = select <16 x i1> %cond, <16 x i16> %b, <16 x i16> zeroinitializer ret <16 x i16> %r @@ -639,11 +653,18 @@ ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: signbit_setmask_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_setmask_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_setmask_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> , <8 x i16> %b ret <8 x i16> %r @@ -770,11 +791,18 @@ ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: signbit_setmask_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: signbit_setmask_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: signbit_setmask_v16i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %ymm0, %ymm0 +; AVX512DQBW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <16 x i16> %a, zeroinitializer %r = select <16 x i1> %cond, <16 x i16> , <16 x i16> %b ret <16 x i16> %r @@ -980,11 +1008,18 @@ ; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: not_signbit_mask_swap_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $15, %xmm0, %xmm0 -; AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: not_signbit_mask_swap_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512DQBW-LABEL: not_signbit_mask_swap_v8i16: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX512DQBW-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512DQBW-NEXT: retq %cond = icmp slt <8 x i16> %a, zeroinitializer %r = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %b ret <8 x i16> %r diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -569,8 +569,7 @@ ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -583,23 +582,10 @@ ; SSE41-NEXT: movd %edi, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE41-NEXT: pinsrd $1, %edi, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq -; -; AVX-LABEL: simplify_select: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -671,12 +671,12 @@ ; X86-SSE2-NEXT: andl $15, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 8(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 12(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -827,12 +827,12 @@ ; X86-SSE2-NEXT: movsbl %cl, %ecx ; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 24(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 28(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -986,12 +986,12 @@ ; X86-SSE2-NEXT: andl $15, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi -; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi -; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx -; X86-SSE2-NEXT: movl %ecx, 8(%eax) -; X86-SSE2-NEXT: movl %edi, 12(%eax) -; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl 8(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 12(%esp,%ecx), %ecx +; X86-SSE2-NEXT: movl %ecx, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) +; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1088,12 +1088,12 @@ ; X64-SSE2-NEXT: andl $31, %esi ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_32bytes: @@ -1121,10 +1121,8 @@ ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %eax -; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1171,23 +1169,23 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 16(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 20(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 32(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 36(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1229,10 +1227,8 @@ ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl @@ -1264,12 +1260,12 @@ ; X64-SSE2-NEXT: movsbq %sil, %rax ; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: shl_32bytes: @@ -1301,10 +1297,8 @@ ; X64-AVX-NEXT: andb $31, %al ; X64-AVX-NEXT: negb %al ; X64-AVX-NEXT: movsbq %al, %rax -; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0 -; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; @@ -1348,28 +1342,28 @@ ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andb $31, %al ; X86-SSE2-NEXT: negb %al -; X86-SSE2-NEXT: movsbl %al, %eax -; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 48(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 56(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 68(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx +; X86-SSE2-NEXT: movsbl %al, %ecx +; X86-SSE2-NEXT: movl 40(%esp,%ecx), %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 44(%esp,%ecx), %eax +; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SSE2-NEXT: movl 48(%esp,%ecx), %esi +; X86-SSE2-NEXT: movl 52(%esp,%ecx), %edi +; X86-SSE2-NEXT: movl 56(%esp,%ecx), %ebx +; X86-SSE2-NEXT: movl 60(%esp,%ecx), %ebp +; X86-SSE2-NEXT: movl 64(%esp,%ecx), %edx +; X86-SSE2-NEXT: movl 68(%esp,%ecx), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1415,10 +1409,8 @@ ; X86-AVX-NEXT: andb $31, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx -; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl @@ -1449,12 +1441,12 @@ ; X64-SSE2-NEXT: andl $31, %esi ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_32bytes: @@ -1493,10 +1485,9 @@ ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $31, %esi -; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %ymm0 +; X64-AVX-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X86-SSE2-LABEL: ashr_32bytes: @@ -1543,23 +1534,23 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 16(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 20(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 32(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 36(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 24(%eax) -; X86-SSE2-NEXT: movl %edx, 28(%eax) -; X86-SSE2-NEXT: movl %ebp, 16(%eax) -; X86-SSE2-NEXT: movl %ebx, 20(%eax) -; X86-SSE2-NEXT: movl %edi, 8(%eax) -; X86-SSE2-NEXT: movl %esi, 12(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 28(%eax) +; X86-SSE2-NEXT: movl %edx, 24(%eax) +; X86-SSE2-NEXT: movl %ebp, 20(%eax) +; X86-SSE2-NEXT: movl %ebx, 16(%eax) +; X86-SSE2-NEXT: movl %edi, 12(%eax) +; X86-SSE2-NEXT: movl %esi, 8(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $72, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1637,14 +1628,13 @@ ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $31, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX-NEXT: vmovups %ymm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx +; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 @@ -1686,20 +1676,20 @@ ; X64-SSE2-NEXT: andl $63, %esi ; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax ; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx -; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi -; X64-SSE2-NEXT: movq %rsi, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rax, (%rdx) +; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %rdi +; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %r8 +; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r9 +; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r10 +; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %r11 +; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %rsi +; X64-SSE2-NEXT: movq %rsi, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) +; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; @@ -1725,8 +1715,8 @@ ; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; @@ -1741,14 +1731,10 @@ ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: andl $63, %eax -; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1760,14 +1746,8 @@ ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: andl $63, %eax -; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -1854,55 +1834,55 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 88(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 92(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 96(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 100(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) +; X86-SSE2-NEXT: movl %ecx, 60(%eax) +; X86-SSE2-NEXT: movl %edx, 56(%eax) +; X86-SSE2-NEXT: movl %esi, 52(%eax) +; X86-SSE2-NEXT: movl %edi, 48(%eax) +; X86-SSE2-NEXT: movl %ebx, 44(%eax) +; X86-SSE2-NEXT: movl %ebp, 40(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -1957,14 +1937,10 @@ ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, (%esp) ; X86-AVX1-NEXT: andl $63, %ecx -; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX1-NEXT: vmovups %xmm0, (%eax) +; X86-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) ; X86-AVX1-NEXT: addl $128, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl @@ -1981,14 +1957,8 @@ ; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: vmovups %zmm0, (%esp) ; X86-AVX512-NEXT: andl $63, %ecx -; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) +; X86-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) ; X86-AVX512-NEXT: addl $128, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl @@ -2033,20 +2003,20 @@ ; X64-SSE2-NEXT: movslq %esi, %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11 -; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -48(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -40(%rsp,%rax), %r8 +; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r9 +; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r10 +; X64-SSE2-NEXT: movq -16(%rsp,%rax), %r11 +; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; @@ -2074,8 +2044,8 @@ ; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2 ; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; @@ -2092,14 +2062,10 @@ ; X64-AVX1-NEXT: andl $63, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq -; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %ymm0 +; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -2113,14 +2079,8 @@ ; X64-AVX512-NEXT: andl $63, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq -; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 -; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1 -; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2 -; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3 -; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) +; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; @@ -2207,58 +2167,58 @@ ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl (%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 4(%ecx), %edx +; X86-SSE2-NEXT: movl 8(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 8(%ecx), %edx +; X86-SSE2-NEXT: movl 16(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 16(%ecx), %edx +; X86-SSE2-NEXT: movl 24(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 24(%ecx), %edx +; X86-SSE2-NEXT: movl 32(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%ecx), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 32(%ecx), %edx -; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 44(%ecx), %ebp -; X86-SSE2-NEXT: movl 40(%ecx), %ebx -; X86-SSE2-NEXT: movl 52(%ecx), %edi -; X86-SSE2-NEXT: movl 60(%ecx), %esi +; X86-SSE2-NEXT: movl 40(%ecx), %ebp +; X86-SSE2-NEXT: movl 44(%ecx), %ebx +; X86-SSE2-NEXT: movl 48(%ecx), %edi +; X86-SSE2-NEXT: movl 52(%ecx), %esi ; X86-SSE2-NEXT: movl 56(%ecx), %edx +; X86-SSE2-NEXT: movl 60(%ecx), %ecx ; X86-SSE2-NEXT: negl %eax -; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 108(%esp,%eax), %eax +; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl %ecx, 60(%eax) ; X86-SSE2-NEXT: movl %edx, 56(%eax) -; X86-SSE2-NEXT: movl %esi, 60(%eax) -; X86-SSE2-NEXT: movl %ecx, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) -; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) +; X86-SSE2-NEXT: movl %esi, 52(%eax) +; X86-SSE2-NEXT: movl %edi, 48(%eax) +; X86-SSE2-NEXT: movl %ebx, 44(%eax) +; X86-SSE2-NEXT: movl %ebp, 40(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -2290,13 +2250,13 @@ ; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: subl %ecx, %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 -; X86-SSE42-NEXT: movups 16(%edx), %xmm1 -; X86-SSE42-NEXT: movups 32(%edx), %xmm2 +; X86-SSE42-NEXT: movups 32(%edx), %xmm1 +; X86-SSE42-NEXT: movups 48(%edx), %xmm2 ; X86-SSE42-NEXT: negl %ecx -; X86-SSE42-NEXT: movups 112(%esp,%ecx), %xmm3 -; X86-SSE42-NEXT: movups %xmm3, 48(%eax) -; X86-SSE42-NEXT: movups %xmm2, 32(%eax) -; X86-SSE42-NEXT: movups %xmm1, 16(%eax) +; X86-SSE42-NEXT: movups 80(%esp,%ecx), %xmm3 +; X86-SSE42-NEXT: movups %xmm2, 48(%eax) +; X86-SSE42-NEXT: movups %xmm1, 32(%eax) +; X86-SSE42-NEXT: movups %xmm3, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $128, %esp ; X86-SSE42-NEXT: retl @@ -2318,15 +2278,11 @@ ; X86-AVX1-NEXT: andl $63, %ecx ; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: subl %ecx, %edx -; X86-AVX1-NEXT: vmovups (%edx), %xmm0 -; X86-AVX1-NEXT: vmovups 16(%edx), %xmm1 -; X86-AVX1-NEXT: vmovups 32(%edx), %xmm2 +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 ; X86-AVX1-NEXT: negl %ecx -; X86-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX1-NEXT: vmovups %xmm0, (%eax) +; X86-AVX1-NEXT: vmovups 96(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) ; X86-AVX1-NEXT: addl $128, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl @@ -2343,17 +2299,9 @@ ; X86-AVX512-NEXT: vmovups %zmm1, (%esp) ; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: andl $63, %ecx -; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: subl %ecx, %edx -; X86-AVX512-NEXT: vmovups (%edx), %xmm0 -; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1 -; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2 ; X86-AVX512-NEXT: negl %ecx -; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3 -; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX512-NEXT: vmovups %xmm0, (%eax) +; X86-AVX512-NEXT: vmovups 64(%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) ; X86-AVX512-NEXT: addl $128, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl @@ -2398,20 +2346,20 @@ ; X64-SSE2-NEXT: andl $63, %eax ; X64-SSE2-NEXT: movq -128(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -120(%rsp,%rax), %rsi -; X64-SSE2-NEXT: movq -104(%rsp,%rax), %rdi -; X64-SSE2-NEXT: movq -112(%rsp,%rax), %r8 -; X64-SSE2-NEXT: movq -88(%rsp,%rax), %r9 -; X64-SSE2-NEXT: movq -96(%rsp,%rax), %r10 -; X64-SSE2-NEXT: movq -72(%rsp,%rax), %r11 -; X64-SSE2-NEXT: movq -80(%rsp,%rax), %rax -; X64-SSE2-NEXT: movq %rax, 48(%rdx) -; X64-SSE2-NEXT: movq %r11, 56(%rdx) -; X64-SSE2-NEXT: movq %r10, 32(%rdx) -; X64-SSE2-NEXT: movq %r9, 40(%rdx) -; X64-SSE2-NEXT: movq %r8, 16(%rdx) -; X64-SSE2-NEXT: movq %rdi, 24(%rdx) -; X64-SSE2-NEXT: movq %rcx, (%rdx) +; X64-SSE2-NEXT: movq -112(%rsp,%rax), %rdi +; X64-SSE2-NEXT: movq -104(%rsp,%rax), %r8 +; X64-SSE2-NEXT: movq -96(%rsp,%rax), %r9 +; X64-SSE2-NEXT: movq -88(%rsp,%rax), %r10 +; X64-SSE2-NEXT: movq -80(%rsp,%rax), %r11 +; X64-SSE2-NEXT: movq -72(%rsp,%rax), %rax +; X64-SSE2-NEXT: movq %rax, 56(%rdx) +; X64-SSE2-NEXT: movq %r11, 48(%rdx) +; X64-SSE2-NEXT: movq %r10, 40(%rdx) +; X64-SSE2-NEXT: movq %r9, 32(%rdx) +; X64-SSE2-NEXT: movq %r8, 24(%rdx) +; X64-SSE2-NEXT: movq %rdi, 16(%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) +; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: popq %r14 ; X64-SSE2-NEXT: retq @@ -2443,43 +2391,65 @@ ; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1 ; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3 -; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) -; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) +; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) +; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: ashr_64bytes: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX-NEXT: movq 48(%rdi), %rax -; X64-AVX-NEXT: movq 56(%rdi), %rcx -; X64-AVX-NEXT: movl (%rsi), %esi -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: sarq $63, %rcx -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: andl $63, %esi -; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0 -; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1 -; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2 -; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3 -; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) -; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) -; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) -; X64-AVX-NEXT: vmovups %xmm0, (%rdx) -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: ashr_64bytes: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX1-NEXT: movq 48(%rdi), %rax +; X64-AVX1-NEXT: movq 56(%rdi), %rcx +; X64-AVX1-NEXT: movl (%rsi), %esi +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: sarq $63, %rcx +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX1-NEXT: andl $63, %esi +; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi), %ymm0 +; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi), %ymm1 +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdx) +; X64-AVX1-NEXT: vmovups %ymm0, (%rdx) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: ashr_64bytes: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups (%rdi), %ymm0 +; X64-AVX512-NEXT: vmovups 32(%rdi), %xmm1 +; X64-AVX512-NEXT: movq 48(%rdi), %rax +; X64-AVX512-NEXT: movq 56(%rdi), %rcx +; X64-AVX512-NEXT: movl (%rsi), %esi +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: sarq $63, %rcx +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-AVX512-NEXT: andl $63, %esi +; X64-AVX512-NEXT: vmovups -128(%rsp,%rsi), %zmm0 +; X64-AVX512-NEXT: vmovups %zmm0, (%rdx) +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq ; ; X86-SSE2-LABEL: ashr_64bytes: ; X86-SSE2: # %bb.0: @@ -2565,55 +2535,55 @@ ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx -; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp -; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx -; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi -; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi -; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx -; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx +; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebp +; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebx +; X86-SSE2-NEXT: movl 88(%esp,%eax), %edi +; X86-SSE2-NEXT: movl 92(%esp,%eax), %esi +; X86-SSE2-NEXT: movl 96(%esp,%eax), %edx +; X86-SSE2-NEXT: movl 100(%esp,%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl %ecx, 56(%eax) -; X86-SSE2-NEXT: movl %edx, 60(%eax) -; X86-SSE2-NEXT: movl %esi, 48(%eax) -; X86-SSE2-NEXT: movl %edi, 52(%eax) -; X86-SSE2-NEXT: movl %ebx, 40(%eax) -; X86-SSE2-NEXT: movl %ebp, 44(%eax) +; X86-SSE2-NEXT: movl %ecx, 60(%eax) +; X86-SSE2-NEXT: movl %edx, 56(%eax) +; X86-SSE2-NEXT: movl %esi, 52(%eax) +; X86-SSE2-NEXT: movl %edi, 48(%eax) +; X86-SSE2-NEXT: movl %ebx, 44(%eax) +; X86-SSE2-NEXT: movl %ebp, 40(%eax) ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 32(%eax) -; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 24(%eax) +; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 16(%eax) +; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, 8(%eax) +; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: addl $168, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi @@ -2677,60 +2647,105 @@ ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: ashr_64bytes: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: pushl %ebx -; X86-AVX-NEXT: pushl %edi -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: subl $128, %esp -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: vmovups (%edx), %ymm0 -; X86-AVX-NEXT: vmovups 32(%edx), %xmm1 -; X86-AVX-NEXT: movl 48(%edx), %esi -; X86-AVX-NEXT: movl 52(%edx), %edi -; X86-AVX-NEXT: movl 56(%edx), %ebx -; X86-AVX-NEXT: movl 60(%edx), %edx -; X86-AVX-NEXT: movl (%ecx), %ecx -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: vmovups %ymm0, (%esp) -; X86-AVX-NEXT: sarl $31, %edx -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-AVX-NEXT: andl $63, %ecx -; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0 -; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1 -; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2 -; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3 -; X86-AVX-NEXT: vmovups %xmm3, 48(%eax) -; X86-AVX-NEXT: vmovups %xmm2, 32(%eax) -; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) -; X86-AVX-NEXT: vmovups %xmm0, (%eax) -; X86-AVX-NEXT: addl $128, %esp -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: popl %edi -; X86-AVX-NEXT: popl %ebx -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: ashr_64bytes: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: subl $128, %esp +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: vmovups (%edx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%edx), %xmm1 +; X86-AVX1-NEXT: movl 48(%edx), %esi +; X86-AVX1-NEXT: movl 52(%edx), %edi +; X86-AVX1-NEXT: movl 56(%edx), %ebx +; X86-AVX1-NEXT: movl 60(%edx), %edx +; X86-AVX1-NEXT: movl (%ecx), %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: vmovups %ymm0, (%esp) +; X86-AVX1-NEXT: sarl $31, %edx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX1-NEXT: andl $63, %ecx +; X86-AVX1-NEXT: vmovups (%esp,%ecx), %ymm0 +; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %ymm1 +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) +; X86-AVX1-NEXT: addl $128, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: ashr_64bytes: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: pushl %ebx +; X86-AVX512-NEXT: pushl %edi +; X86-AVX512-NEXT: pushl %esi +; X86-AVX512-NEXT: subl $128, %esp +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX512-NEXT: vmovups (%edx), %ymm0 +; X86-AVX512-NEXT: vmovups 32(%edx), %xmm1 +; X86-AVX512-NEXT: movl 48(%edx), %esi +; X86-AVX512-NEXT: movl 52(%edx), %edi +; X86-AVX512-NEXT: movl 56(%edx), %ebx +; X86-AVX512-NEXT: movl 60(%edx), %edx +; X86-AVX512-NEXT: movl (%ecx), %ecx +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: vmovups %ymm0, (%esp) +; X86-AVX512-NEXT: sarl $31, %edx +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-AVX512-NEXT: andl $63, %ecx +; X86-AVX512-NEXT: vmovups (%esp,%ecx), %zmm0 +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) +; X86-AVX512-NEXT: addl $128, %esp +; X86-AVX512-NEXT: popl %esi +; X86-AVX512-NEXT: popl %edi +; X86-AVX512-NEXT: popl %ebx +; X86-AVX512-NEXT: vzeroupper +; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 @@ -2772,5 +2787,5 @@ ; FALLBACK7: {{.*}} ; FALLBACK8: {{.*}} ; FALLBACK9: {{.*}} -; X86: {{.*}} ; X64: {{.*}} +; X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -608,40 +608,42 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ah ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ah, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -689,13 +691,11 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp @@ -731,30 +731,30 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %bl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -768,7 +768,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx @@ -779,7 +779,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -789,26 +789,28 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -920,43 +922,44 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ebp), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 12(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -998,22 +1001,25 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notb %cl -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%ebp), %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%ecx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 8(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1048,29 +1054,29 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -1122,10 +1128,10 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi @@ -1239,40 +1245,42 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl (%esp), %ebp # 4-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 12(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 8(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 4(%edx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -1321,13 +1329,11 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%ebp), %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%ecx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 8(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $32, %esp @@ -1364,30 +1370,30 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $15, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $32, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1401,7 +1407,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx @@ -1412,7 +1418,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -1423,26 +1429,28 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $15, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esp,%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ebx, %ebp, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esp,%edx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esp,%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 8(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%ebp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $32, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $36, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -1458,7 +1466,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 @@ -1475,39 +1482,37 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1544,8 +1549,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -1569,19 +1574,19 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx @@ -1589,8 +1594,8 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1627,8 +1632,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx @@ -1640,17 +1645,17 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%ecx), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%ecx), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edx), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edx), %edx @@ -1660,10 +1665,10 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1672,95 +1677,87 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1822,26 +1819,26 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1849,24 +1846,24 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -1880,13 +1877,13 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp @@ -1900,7 +1897,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -1915,66 +1912,62 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edi, %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -1987,13 +1980,13 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp @@ -2007,10 +2000,10 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2023,54 +2016,60 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%edi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -2116,26 +2115,26 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq @@ -2174,11 +2173,12 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rsi), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq ; @@ -2205,24 +2205,24 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2263,8 +2263,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq @@ -2279,9 +2279,9 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp @@ -2295,10 +2295,10 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2312,90 +2312,88 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: negb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -2496,12 +2494,13 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2519,13 +2518,13 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ebp @@ -2539,10 +2538,10 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -2555,72 +2554,64 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ecx), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 84(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, (%esp), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 76(%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -2709,11 +2700,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) @@ -2734,7 +2725,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx ; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 @@ -2752,39 +2742,37 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r9d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r9), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %sil, %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r9), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r9), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2822,8 +2810,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -2848,19 +2836,19 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -64(%rsp,%rcx), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rcx, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx @@ -2868,8 +2856,8 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2907,8 +2895,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx @@ -2920,18 +2908,18 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%eax), %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax @@ -2940,11 +2928,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: sarl $31, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -2955,95 +2943,87 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %al +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: andb $7, %al -; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %ch, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ebp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ah -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ah -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ebx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esp,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ah, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 28(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 24(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $88, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $84, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3108,26 +3088,26 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%ebp), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%ebp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3135,24 +3115,24 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 28(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 20(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $92, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -3166,7 +3146,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esi), %ecx @@ -3174,7 +3154,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esi), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx @@ -3186,7 +3166,7 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) @@ -3201,70 +3181,65 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 20(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $7, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 28(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 24(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 20(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%ecx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 12(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $84, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $80, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx @@ -3277,13 +3252,13 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $84, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $96, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %ebp @@ -3297,10 +3272,10 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -3314,59 +3289,65 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andb $7, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %al -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %eax, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esp,%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebp,%ebp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%edi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%ebx,%ebx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%edi), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esp,%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal (%edx,%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esp,%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esp,%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%edi), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esp,%edi), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %eax, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $84, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 24(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 16(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $96, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %bitOff = load i256, ptr %bitOff.ptr, align 1 %res = ashr i256 %src, %bitOff @@ -3377,7 +3358,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 @@ -3412,80 +3392,76 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3566,8 +3542,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) @@ -3620,55 +3596,55 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %r13b +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -3743,8 +3719,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) @@ -4207,41 +4183,41 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -4722,9 +4698,9 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r14), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r14), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi @@ -4735,65 +4711,65 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r14), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r14), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r14), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r14), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r14), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 56(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 @@ -4842,55 +4818,57 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r10), %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %esi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r10), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r10), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r10), %r15 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r10), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r10), %r12 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r12 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r12, %r13 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r13 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r10), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbp +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r13, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r9), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 40(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 40(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) @@ -4942,55 +4920,55 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rcx), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r9d +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rcx), %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r15, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %bpl -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rbx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rcx), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r11, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r9d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r9d ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rcx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rcx), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rcx), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r10, %r15, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r9, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rbp, %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r13, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -5039,9 +5017,9 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %r10 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r10, %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebx @@ -5052,9 +5030,9 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r12 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r12, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %r12 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r13 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rdi, %r12 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrq %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %r13, %r13 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r13 @@ -5064,17 +5042,17 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rbx, %rbp, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r14, %rbp ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r11, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%rax), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r14, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, (%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r13, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r12, 16(%rdx) @@ -5093,67 +5071,61 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $192, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebx), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5170,196 +5142,197 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: notb %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, (%esp) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $7, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 132(%esp,%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%ecx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%ecx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 56(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, (%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 60(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 48(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 52(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $192, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi @@ -5374,7 +5347,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $212, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -5385,13 +5358,13 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx @@ -5414,13 +5387,13 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -5452,123 +5425,132 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $7, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: notl %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 152(%esp,%edi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: negl %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%edi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebp, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esi), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 52(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 12(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 56(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5581,7 +5563,7 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $212, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -5836,44 +5818,45 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $200, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edi), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%esi), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edi), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edi), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edi), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edi), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%esi), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esi), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esi), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esi), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esi), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esi), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5883,22 +5866,21 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $7, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %esi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -5915,111 +5897,109 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: notl %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%edx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%edx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%edx), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%edx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 188(%esp,%esi), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 140(%esp,%esi), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%edx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %edi, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: orl %ecx, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, (%esp) # 4-byte Folded Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%edx), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 52(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 44(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlxl %eax, %ebp, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %eax, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 44(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 4(%edx) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -6034,7 +6014,7 @@ ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $204, %esp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $200, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx @@ -6049,7 +6029,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 @@ -6085,80 +6064,76 @@ ; X64-NO-BMI2-NO-SHLD-NEXT: andl $7, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: notb %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, 40(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6240,8 +6215,8 @@ ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, 48(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) @@ -6295,55 +6270,55 @@ ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $7, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %r13b +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r15, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r13, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 @@ -6419,8 +6394,8 @@ ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r13, %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) @@ -6885,41 +6860,41 @@ ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%esi), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 136(%esp,%esi), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl %cl, %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 48(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%edx) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%edx) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, 60(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, 56(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-HAVE-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -7367,9 +7342,9 @@ } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -28,25 +28,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzbl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzbl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzbl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init1 = load i8, ptr %src, align 1 %intermediate.sroa.0.0.vec.insert = insertelement <2 x i8> , i8 %init1, i64 0 @@ -81,25 +80,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> @@ -135,25 +133,24 @@ ; ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-NO-BMI2: # %bb.0: +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movzwl (%edx), %edx -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca_with_zero_upper_half: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movw %cx, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movw %ax, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <2 x i8> %init, <2 x i8> poison, <4 x i32> @@ -193,11 +190,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -220,11 +217,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx @@ -241,11 +238,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -268,11 +265,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -322,11 +319,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi @@ -349,11 +346,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -370,11 +367,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -396,11 +393,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -448,11 +445,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm1, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi @@ -475,11 +472,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -496,11 +493,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm1, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -522,11 +519,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -623,10 +620,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -638,9 +634,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $32, %esp @@ -733,10 +727,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -748,9 +741,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $32, %esp @@ -842,10 +833,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -857,9 +847,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $32, %esp @@ -951,10 +939,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -966,9 +953,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1012,10 +997,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1035,8 +1019,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $64, %esp @@ -1079,10 +1062,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1102,8 +1084,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1145,10 +1126,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1168,8 +1148,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1211,10 +1190,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1234,8 +1212,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1283,10 +1260,9 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -1306,8 +1282,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %esi ; X86-NEXT: movl 8(%esp,%ecx), %edi diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -30,25 +30,24 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movzwl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movzwl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_2byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI2-NEXT: movzwl (%edx), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, %edx, %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, %edx, %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <2 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <2 x i8> %init @@ -83,24 +82,23 @@ ; ; X86-NO-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: ; X86-NO-BMI2: # %bb.0: -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl (%eax), %eax -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NEXT: movb %al, (%edx) +; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NO-BMI2-NEXT: movl (%edx), %edx +; X86-NO-BMI2-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_1byte_chunk_of_4byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, (%edx), %ecx -; X86-BMI2-NEXT: movb %cl, (%eax) +; X86-BMI2-NEXT: shrxl %eax, (%edx), %eax +; X86-BMI2-NEXT: movb %al, (%ecx) ; X86-BMI2-NEXT: retl %init = load <4 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <4 x i8> %init @@ -133,24 +131,23 @@ ; ; X86-NO-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca: ; X86-NO-BMI2: # %bb.0: +; X86-NO-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-NEXT: shlb $3, %cl ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NEXT: movl (%edx), %edx -; X86-NO-BMI2-NEXT: shll $3, %ecx -; X86-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NEXT: movw %dx, (%eax) ; X86-NO-BMI2-NEXT: retl ; ; X86-BMI2-LABEL: load_2byte_chunk_of_4byte_alloca: ; X86-BMI2: # %bb.0: -; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-BMI2-NEXT: shlb $3, %al ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI2-NEXT: shll $3, %ecx -; X86-BMI2-NEXT: shrxl %ecx, (%edx), %ecx -; X86-BMI2-NEXT: movw %cx, (%eax) +; X86-BMI2-NEXT: shrxl %eax, (%edx), %eax +; X86-BMI2-NEXT: movw %ax, (%ecx) ; X86-BMI2-NEXT: retl %init = load <4 x i8>, ptr %src, align 1 %intermediate.val.frozen = freeze <4 x i8> %init @@ -187,11 +184,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx @@ -214,11 +211,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx @@ -235,11 +232,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -262,11 +259,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -311,11 +308,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -338,11 +335,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -359,11 +356,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -385,11 +382,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -432,11 +429,11 @@ ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shlb $3, %al ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -459,11 +456,11 @@ ; X86-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca: ; X86-NO-BMI2-HAVE-SHLD: # %bb.0: ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -480,11 +477,11 @@ ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi @@ -506,11 +503,11 @@ ; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_8byte_alloca: ; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0: ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shlb $3, %cl ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi @@ -607,10 +604,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -622,9 +618,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $32, %esp @@ -715,10 +709,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -730,9 +723,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $32, %esp @@ -822,10 +813,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -837,9 +827,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $32, %esp @@ -929,10 +917,9 @@ ; X86: # %bb.0: ; X86-NEXT: subl $32, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] @@ -944,9 +931,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $15, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -992,11 +977,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1019,8 +1003,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: addl $64, %esp @@ -1063,11 +1046,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1090,8 +1072,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1133,11 +1114,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1160,8 +1140,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $64, %esp @@ -1203,11 +1182,10 @@ ; X86: # %bb.0: ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1230,8 +1208,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %ecx ; X86-NEXT: movl %ecx, 4(%eax) @@ -1279,11 +1256,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movdqu (%edx), %xmm0 ; X86-NEXT: movdqu 16(%edx), %xmm1 -; X86-NEXT: shll $3, %ecx ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] @@ -1306,8 +1282,7 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: shrb $3, %cl -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: andl $31, %ecx ; X86-NEXT: movl (%esp,%ecx), %edx ; X86-NEXT: movl 4(%esp,%ecx), %esi ; X86-NEXT: movl 8(%esp,%ecx), %edi @@ -1334,7 +1309,7 @@ ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/widen_bitops-1.ll b/llvm/test/CodeGen/X86/widen_bitops-1.ll --- a/llvm/test/CodeGen/X86/widen_bitops-1.ll +++ b/llvm/test/CodeGen/X86/widen_bitops-1.ll @@ -70,8 +70,91 @@ define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: and_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: pand %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: and_i32_as_v8i4: @@ -89,8 +172,91 @@ define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: xor_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: pxor %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: xor_i32_as_v8i4: @@ -108,8 +274,91 @@ define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind { ; X86-LABEL: or_i32_as_v8i4: ; X86: # %bb.0: +; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm0 +; X86-NEXT: pinsrw $1, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm0 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm0 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $15, %edx +; X86-NEXT: movd %edx, %xmm1 +; X86-NEXT: pinsrw $1, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $8, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $2, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $12, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $3, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $16, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $4, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $20, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $5, %ecx, %xmm1 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrl $24, %ecx +; X86-NEXT: andl $15, %ecx +; X86-NEXT: pinsrw $6, %ecx, %xmm1 +; X86-NEXT: shrl $28, %eax +; X86-NEXT: pinsrw $7, %eax, %xmm1 +; X86-NEXT: por %xmm0, %xmm1 +; X86-NEXT: pextrw $0, %xmm1, %eax +; X86-NEXT: pextrw $1, %xmm1, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $2, %xmm1, %edx +; X86-NEXT: shll $8, %edx +; X86-NEXT: pextrw $3, %xmm1, %eax +; X86-NEXT: shll $12, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: pextrw $4, %xmm1, %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: pextrw $5, %xmm1, %edx +; X86-NEXT: shll $20, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: pextrw $6, %xmm1, %ecx +; X86-NEXT: shll $24, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: pextrw $7, %xmm1, %eax +; X86-NEXT: shll $28, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: popl %ecx ; X86-NEXT: retl ; ; X64-LABEL: or_i32_as_v8i4: diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll --- a/llvm/test/CodeGen/X86/widen_cast-2.ll +++ b/llvm/test/CodeGen/X86/widen_cast-2.ll @@ -22,9 +22,9 @@ ; CHECK-NEXT: psubw %xmm0, %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) -; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) +; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) +; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) ; CHECK-NEXT: jle .LBB0_2 diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll --- a/llvm/test/CodeGen/X86/widen_fdiv.ll +++ b/llvm/test/CodeGen/X86/widen_fdiv.ll @@ -67,17 +67,46 @@ ; ; AVX1OR2-LABEL: widen_fdiv_v2f32_v8f32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0 +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX1OR2-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: widen_fdiv_v2f32_v8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovups (%rdi), %ymm0 -; AVX512F-NEXT: vdivps (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovups %ymm0, (%rdx) +; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX512F-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512F-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,8,10] +; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 +; AVX512F-NEXT: vmovupd %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -89,18 +118,17 @@ ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero -; AVX512VL-NEXT: vdivps %xmm5, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero ; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero -; AVX512VL-NEXT: vdivps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX512VL-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vdivps %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vdivps %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX512VL-NEXT: vmovups %ymm0, (%rdx) +; AVX512VL-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6] +; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 +; AVX512VL-NEXT: vmovupd %ymm1, (%rdx) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %a2 = getelementptr inbounds i8, ptr %a0, i64 8 @@ -170,12 +198,42 @@ ; ; AVX1OR2-LABEL: widen_fdiv_v2f32_v16f32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 -; AVX1OR2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1OR2-NEXT: vdivps (%rsi), %ymm0, %ymm0 -; AVX1OR2-NEXT: vdivps 32(%rsi), %ymm1, %ymm1 +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm13 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm14 = mem[0],zero +; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm15 = mem[0],zero +; AVX1OR2-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm6, %ymm2, %ymm2 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vdivps %ymm3, %ymm0, %ymm0 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX1OR2-NEXT: vmovups %ymm0, (%rdx) -; AVX1OR2-NEXT: vmovups %ymm1, 32(%rdx) +; AVX1OR2-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm1 +; AVX1OR2-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm1 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm2 +; AVX1OR2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] +; AVX1OR2-NEXT: vmovups %ymm0, 32(%rdx) ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -47,9 +47,9 @@ ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: movd %xmm1, (%eax) -; X86-NEXT: pextrd $1, %xmm1, 4(%eax) ; X86-NEXT: pextrd $2, %xmm1, 8(%eax) +; X86-NEXT: pextrd $1, %xmm1, 4(%eax) +; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; ; X64-LABEL: add3i32_2: @@ -81,9 +81,9 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -94,8 +94,8 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddd (%rdx), %xmm0 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec7, ptr %ap, align 16 @@ -116,10 +116,10 @@ ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddd (%ecx), %xmm1 -; X86-NEXT: paddd 32(%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm2 -; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: paddd 32(%ecx), %xmm0 ; X86-NEXT: movdqa %xmm0, 32(%eax) +; X86-NEXT: movdqa %xmm2, 16(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -130,10 +130,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddd (%rdx), %xmm0 -; X64-NEXT: paddd 32(%rdx), %xmm2 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: paddd 32(%rdx), %xmm2 ; X64-NEXT: movdqa %xmm2, 32(%rdi) +; X64-NEXT: movdqa %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec12, ptr %ap, align 16 @@ -215,8 +215,8 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -248,10 +248,10 @@ ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddw (%ecx), %xmm1 -; X86-NEXT: paddw 32(%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm2 -; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: paddw 32(%ecx), %xmm0 ; X86-NEXT: movd %xmm0, 32(%eax) +; X86-NEXT: movdqa %xmm2, 16(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -262,10 +262,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 32(%rdx), %xmm2 ; X64-NEXT: paddw 16(%rdx), %xmm1 -; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: paddw 32(%rdx), %xmm2 ; X64-NEXT: movd %xmm2, 32(%rdi) +; X64-NEXT: movdqa %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i16vec18, ptr %ap, align 16 @@ -317,11 +317,11 @@ ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: movd %xmm1, 16(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrw $6, %xmm1, 28(%eax) ; X86-NEXT: pextrb $14, %xmm1, 30(%eax) +; X86-NEXT: pextrw $6, %xmm1, 28(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: movd %xmm1, 16(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -332,10 +332,10 @@ ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddb (%rdx), %xmm0 ; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: movq %xmm1, 16(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) -; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) ; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) +; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: movq %xmm1, 16(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i8vec31, ptr %ap, align 16 diff --git a/llvm/test/CodeGen/X86/win64-byval.ll b/llvm/test/CodeGen/X86/win64-byval.ll --- a/llvm/test/CodeGen/X86/win64-byval.ll +++ b/llvm/test/CodeGen/X86/win64-byval.ll @@ -64,12 +64,12 @@ ; CHECK-NEXT: movq 8(%rax), %rax ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/win64_frame.ll b/llvm/test/CodeGen/X86/win64_frame.ll --- a/llvm/test/CodeGen/X86/win64_frame.ll +++ b/llvm/test/CodeGen/X86/win64_frame.ll @@ -27,9 +27,9 @@ ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .seh_setframe %rbp, 0 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: movq %rdx, 32(%rbp) -; CHECK-NEXT: movq %r8, 40(%rbp) ; CHECK-NEXT: movq %r9, 48(%rbp) +; CHECK-NEXT: movq %r8, 40(%rbp) +; CHECK-NEXT: movq %rdx, 32(%rbp) ; CHECK-NEXT: leaq 32(%rbp), %rax ; CHECK-NEXT: movq %rax, (%rbp) ; CHECK-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/x86-64-baseptr.ll b/llvm/test/CodeGen/X86/x86-64-baseptr.ll --- a/llvm/test/CodeGen/X86/x86-64-baseptr.ll +++ b/llvm/test/CodeGen/X86/x86-64-baseptr.ll @@ -318,11 +318,11 @@ ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $200, %rsp ; CHECK-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movq %rsi, -184(%rbp) -; CHECK-NEXT: movq %rdx, -176(%rbp) -; CHECK-NEXT: movq %rcx, -168(%rbp) -; CHECK-NEXT: movq %r8, -160(%rbp) ; CHECK-NEXT: movq %r9, -152(%rbp) +; CHECK-NEXT: movq %r8, -160(%rbp) +; CHECK-NEXT: movq %rcx, -168(%rbp) +; CHECK-NEXT: movq %rdx, -176(%rbp) +; CHECK-NEXT: movq %rsi, -184(%rbp) ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je .LBB3_2 ; CHECK-NEXT: # %bb.1: # %entry @@ -361,11 +361,11 @@ ; X32ABI-NEXT: andl $-16, %esp ; X32ABI-NEXT: subl $208, %esp ; X32ABI-NEXT: movl %esp, %ebx -; X32ABI-NEXT: movq %rsi, 24(%ebx) -; X32ABI-NEXT: movq %rdx, 32(%ebx) -; X32ABI-NEXT: movq %rcx, 40(%ebx) -; X32ABI-NEXT: movq %r8, 48(%ebx) ; X32ABI-NEXT: movq %r9, 56(%ebx) +; X32ABI-NEXT: movq %r8, 48(%ebx) +; X32ABI-NEXT: movq %rcx, 40(%ebx) +; X32ABI-NEXT: movq %rdx, 32(%ebx) +; X32ABI-NEXT: movq %rsi, 24(%ebx) ; X32ABI-NEXT: testb %al, %al ; X32ABI-NEXT: je .LBB3_2 ; X32ABI-NEXT: # %bb.1: # %entry diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -407,11 +407,10 @@ ; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vmovdqu (%rdi), %ymm2 -; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm2 +; AVX512-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vzeroupper @@ -1162,18 +1161,18 @@ ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) ; AVX1-NEXT: vmovdqu %xmm6, 176(%rdi) ; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi) -; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) ; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) ; AVX1-NEXT: vmovdqu %xmm2, 128(%rdi) +; AVX1-NEXT: vmovdqu %xmm10, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) +; AVX1-NEXT: vmovdqu %xmm7, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm12, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 32(%rdi) +; AVX1-NEXT: vmovdqu %xmm5, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, (%rdi) ; AVX1-NEXT: popq %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1256,11 +1255,11 @@ ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm4 ; AVX512-NEXT: vpshufb %zmm4, %zmm2, %zmm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-NEXT: vpshufb %zmm4, %zmm5, %zmm4 +; AVX512-NEXT: vpshufb %zmm3, %zmm5, %zmm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb %zmm4, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, 128(%rdi) -; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rdi) +; AVX512-NEXT: vmovdqu64 %zmm3, 64(%rdi) ; AVX512-NEXT: vmovdqu64 %zmm2, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1652,10 +1651,10 @@ ; ; AVX512-LABEL: splat2_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, ptr %s, align 8 @@ -1689,10 +1688,10 @@ ; ; AVX512-LABEL: splat2_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovups (%rdi), %ymm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3] -; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7] +; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, ptr %s, align 8 @@ -1705,22 +1704,22 @@ define void @splat4_v8f32_load_store(ptr %s, ptr %d) nounwind { ; AVX1-LABEL: splat4_v8f32_load_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0 -; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2 -; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3 -; AVX1-NEXT: vbroadcastss (%rdi), %xmm4 -; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5 -; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6 -; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7 -; AVX1-NEXT: vmovups %xmm7, 48(%rsi) -; AVX1-NEXT: vmovups %xmm6, 32(%rsi) -; AVX1-NEXT: vmovups %xmm5, 16(%rsi) -; AVX1-NEXT: vmovups %xmm4, (%rsi) -; AVX1-NEXT: vmovups %xmm3, 112(%rsi) -; AVX1-NEXT: vmovups %xmm2, 96(%rsi) -; AVX1-NEXT: vmovups %xmm1, 80(%rsi) -; AVX1-NEXT: vmovups %xmm0, 64(%rsi) +; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1 +; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm2 +; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm3 +; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm4 +; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm5 +; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm6 +; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm7 +; AVX1-NEXT: vmovups %xmm7, 112(%rsi) +; AVX1-NEXT: vmovups %xmm6, 96(%rsi) +; AVX1-NEXT: vmovups %xmm5, 80(%rsi) +; AVX1-NEXT: vmovups %xmm4, 64(%rsi) +; AVX1-NEXT: vmovups %xmm3, 48(%rsi) +; AVX1-NEXT: vmovups %xmm2, 32(%rsi) +; AVX1-NEXT: vmovups %xmm1, 16(%rsi) +; AVX1-NEXT: vmovups %xmm0, (%rsi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat4_v8f32_load_store: @@ -1837,14 +1836,15 @@ ; ; AVX512-LABEL: splat4_v4f64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcastf64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x double>, ptr %s, align 8 @@ -1871,14 +1871,15 @@ ; ; AVX512-LABEL: splat4_v4i64_load_store: ; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd 16(%rdi), %ymm1 -; AVX512-NEXT: vbroadcastsd 8(%rdi), %ymm2 -; AVX512-NEXT: vbroadcastsd 24(%rdi), %ymm3 -; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovups %zmm1, 64(%rsi) -; AVX512-NEXT: vmovups %zmm0, (%rsi) +; AVX512-NEXT: vbroadcasti64x4 (%rdi), %zmm0 # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[1,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi) +; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %x = load <4 x i64>, ptr %s, align 8 diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -1178,12 +1178,14 @@ ; ENABLE: ## %bb.0: ## %entry ; ENABLE-NEXT: pushq %rbx ; ENABLE-NEXT: subq $16, %rsp -; ENABLE-NEXT: xorl %eax, %eax -; ENABLE-NEXT: cmpb $0, _b(%rip) -; ENABLE-NEXT: movl $48, %ecx -; ENABLE-NEXT: cmovnel %eax, %ecx -; ENABLE-NEXT: movb %cl, _c(%rip) -; ENABLE-NEXT: je LBB14_4 +; ENABLE-NEXT: movzbl _b(%rip), %eax +; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: testb %al, %al +; ENABLE-NEXT: movl $48, %r8d +; ENABLE-NEXT: cmovnel %ecx, %r8d +; ENABLE-NEXT: movb %r8b, _c(%rip) +; ENABLE-NEXT: cmpb $1, %al +; ENABLE-NEXT: jne LBB14_4 ; ENABLE-NEXT: ## %bb.1: ## %for.body.lr.ph ; ENABLE-NEXT: ## InlineAsm Start ; ENABLE-NEXT: nop @@ -1213,12 +1215,14 @@ ; DISABLE: ## %bb.0: ## %entry ; DISABLE-NEXT: pushq %rbx ; DISABLE-NEXT: subq $16, %rsp -; DISABLE-NEXT: xorl %eax, %eax -; DISABLE-NEXT: cmpb $0, _b(%rip) -; DISABLE-NEXT: movl $48, %ecx -; DISABLE-NEXT: cmovnel %eax, %ecx -; DISABLE-NEXT: movb %cl, _c(%rip) -; DISABLE-NEXT: je LBB14_4 +; DISABLE-NEXT: movzbl _b(%rip), %eax +; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: testb %al, %al +; DISABLE-NEXT: movl $48, %r8d +; DISABLE-NEXT: cmovnel %ecx, %r8d +; DISABLE-NEXT: movb %r8b, _c(%rip) +; DISABLE-NEXT: cmpb $1, %al +; DISABLE-NEXT: jne LBB14_4 ; DISABLE-NEXT: ## %bb.1: ## %for.body.lr.ph ; DISABLE-NEXT: ## InlineAsm Start ; DISABLE-NEXT: nop diff --git a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll @@ -8,8 +8,9 @@ define <4 x i64> @broadcast128(<2 x i64> %src) { ; CHECK-LABEL: broadcast128: ; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $ymm0 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq %1 = alloca <2 x i64>, align 16 store <2 x i64> %src, ptr %1, align 16 diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll --- a/llvm/test/CodeGen/X86/xop-shifts.ll +++ b/llvm/test/CodeGen/X86/xop-shifts.ll @@ -8,9 +8,10 @@ define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: demandedelts_vpshab: ; CHECK: # %bb.0: +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %shift = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuffle, <16 x i8> %a1) diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -394,8 +394,8 @@ define i32 @PR17487(i1 %tobool) { ; X86-LABEL: PR17487: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: notb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: notl %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testb $1, %cl ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1439,38 +1439,38 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v16i16_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v32i8_to_v16i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v32i8_to_v16i16_factor2: @@ -1522,12 +1522,12 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v8i32_factor4: @@ -1606,12 +1606,12 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v32i8_to_v4i64_factor8: @@ -1830,12 +1830,12 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v8i32_factor2: @@ -1845,23 +1845,23 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v16i16_to_v8i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v16i16_to_v8i32_factor2: @@ -1915,12 +1915,12 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v16i16_to_v4i64_factor4: @@ -2122,38 +2122,82 @@ ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-LABEL: vec256_v16i16_to_v1i256_factor16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec256_v16i16_to_v1i256_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -2175,12 +2219,12 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_v8i32_to_v4i64_factor2: @@ -2190,23 +2234,23 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_v8i32_to_v4i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_v8i32_to_v4i64_factor2: @@ -2291,11 +2335,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2327,8 +2371,8 @@ ; ; AVX2-FAST-LABEL: vec256_v8i32_to_v2i128_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2465,10 +2509,10 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm1 -; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec256_v4i64_to_v2i128_factor2: @@ -2478,11 +2522,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2672,14 +2716,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v24i16_factor2: @@ -2689,15 +2733,15 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE42-NEXT: paddb (%rdx), %xmm3 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v24i16_factor2: @@ -2706,15 +2750,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -2739,6 +2783,8 @@ ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2798,30 +2844,30 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[11],zero,zero,xmm1[12],zero,zero,xmm1[13],zero,zero,xmm1[14],zero,zero,xmm1[15],zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero,zero,xmm2[5] -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[6],zero,zero,xmm2[7],zero,zero,xmm2[8],zero,zero,xmm2[9],zero,zero,xmm2[10],zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[6],zero,zero,xmm0[7],zero,zero,xmm0[8],zero,zero,xmm0[9],zero,zero,xmm0[10],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero,zero,xmm0[5] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: retq +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_v48i8_to_v16i24_factor3: ; AVX2: # %bb.0: @@ -2844,6 +2890,8 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0],zero,zero,ymm1[1],zero,zero,ymm1[2],zero,zero,ymm1[3],zero,zero,ymm1[4],zero,zero,ymm1[5],zero,zero,ymm1[22],zero,zero,ymm1[23],zero,zero,ymm1[24],zero,zero,ymm1[25],zero,zero,ymm1[26],zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -2887,14 +2935,14 @@ ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v12i32_factor4: @@ -2977,6 +3025,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3013,16 +3063,16 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v8i48_factor6: @@ -3032,28 +3082,28 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6],zero,zero,zero,zero,zero,xmm1[7],zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,xmm2[3],zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero,zero,xmm2[5],zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v8i48_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,xmm0[5],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; @@ -3076,14 +3126,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3091,10 +3143,8 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,u,3,u,2,u,1,u,4,u,5,u,6,u,5,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 @@ -3126,14 +3176,14 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v48i8_to_v6i64_factor8: @@ -3216,6 +3266,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3275,28 +3327,28 @@ ; SSE42-NEXT: movdqa %xmm0, %xmm1 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero,zero,zero,zero,zero,zero +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v48i8_to_v4i96_factor12: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; @@ -3319,14 +3371,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3442,14 +3496,16 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3515,14 +3571,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3586,8 +3642,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -3663,14 +3719,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v12i32_factor2: @@ -3683,12 +3739,12 @@ ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v12i32_factor2: @@ -3697,15 +3753,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -3730,6 +3786,8 @@ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) @@ -3770,16 +3828,16 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,2] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v8i48_factor3: @@ -3789,51 +3847,51 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,1,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2] ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v8i48_factor3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3841,14 +3899,14 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -3856,14 +3914,14 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -3877,6 +3935,8 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3953,14 +4013,14 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v6i64_factor4: @@ -3984,30 +4044,30 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4015,12 +4075,12 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4028,12 +4088,12 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4084,17 +4144,17 @@ ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v24i16_to_v4i96_factor6: @@ -4120,24 +4180,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6,7] -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: retq -; -; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm3, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: retq +; +; AVX2-SLOW-LABEL: vec384_v24i16_to_v4i96_factor6: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] @@ -4188,10 +4248,12 @@ ; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -4297,32 +4359,32 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper @@ -4330,15 +4392,15 @@ ; ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -4346,15 +4408,15 @@ ; ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper @@ -4367,11 +4429,13 @@ ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -4551,8 +4615,8 @@ ; SSE2-NEXT: movaps 16(%rdx), %xmm1 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; @@ -4565,8 +4629,8 @@ ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; @@ -4585,18 +4649,44 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_v24i16_to_v1i384_factor24: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_v24i16_to_v1i384_factor24: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_v24i16_to_v1i384_factor24: ; AVX512F: # %bb.0: @@ -4644,14 +4734,14 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_v12i32_to_v6i64_factor2: @@ -4664,12 +4754,12 @@ ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm3 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_v12i32_to_v6i64_factor2: @@ -4678,15 +4768,15 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; @@ -4697,10 +4787,10 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4709,11 +4799,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4787,20 +4877,20 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4808,16 +4898,16 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4825,14 +4915,14 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4840,14 +4930,14 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4858,11 +4948,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u> ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4953,36 +5043,36 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper @@ -4990,15 +5080,15 @@ ; ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -5008,15 +5098,15 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5027,11 +5117,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u> ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5122,12 +5212,12 @@ ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5163,13 +5253,13 @@ ; ; AVX2-FAST-LABEL: vec384_v12i32_to_v2i192_factor6: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5181,10 +5271,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: movb $65, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z} -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,1,2,3,4,5,17,7,16,1,2,3,4,5,17,7] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -5226,8 +5317,8 @@ ; SSE2-NEXT: movaps 16(%rdx), %xmm0 ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; @@ -5240,8 +5331,8 @@ ; SSE42-NEXT: movaps 16(%rdx), %xmm0 ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; @@ -5249,9 +5340,9 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -5320,12 +5411,12 @@ ; SSE-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm2 +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb 32(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 32(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec384_v6i64_to_v3i128_factor2: @@ -5338,13 +5429,13 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5371,11 +5462,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5445,12 +5536,12 @@ ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5472,10 +5563,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: movb $9, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z} -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [8,1,2,9,8,1,2,9] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -5516,8 +5608,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -5525,8 +5617,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -5590,8 +5682,8 @@ ; SSE-NEXT: movaps 16(%rdx), %xmm1 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; @@ -5621,8 +5713,9 @@ ; ; AVX512F-LABEL: vec384_v3i128_to_v1i384_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -5661,19 +5754,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v32i16_factor2: @@ -5682,53 +5775,53 @@ ; SSE42-NEXT: movdqa 16(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE42-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE42-NEXT: paddb (%rdx), %xmm4 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: paddb 48(%rdx), %xmm1 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v32i16_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5736,13 +5829,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5774,20 +5867,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v16i32_factor4: @@ -5795,19 +5888,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -5816,20 +5909,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v16i32_factor4: @@ -5851,11 +5944,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5888,20 +5981,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8: @@ -5909,20 +6002,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: psrlq $48, %xmm2 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: psrlq $48, %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -5931,26 +6024,26 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpsrld $16, %xmm0, %xmm2 +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm2 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v64i8_to_v8i64_factor8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero @@ -5966,11 +6059,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6000,20 +6093,20 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2] ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: paddb 16(%rdx), %xmm0 -; SSE-NEXT: paddb 48(%rdx), %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm2 +; SSE-NEXT: paddb 32(%rdx), %xmm3 +; SSE-NEXT: paddb 48(%rdx), %xmm2 ; SSE-NEXT: paddb (%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm2, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) +; SSE-NEXT: movdqa %xmm2, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) ; SSE-NEXT: retq ; @@ -6022,20 +6115,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: @@ -6062,18 +6155,18 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -6081,18 +6174,18 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -6108,11 +6201,11 @@ ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; @@ -6127,12 +6220,12 @@ ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16: @@ -6207,8 +6300,8 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq @@ -6217,13 +6310,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6231,13 +6324,13 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0] +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6273,8 +6366,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -6348,19 +6441,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v16i32_factor2: @@ -6374,48 +6467,48 @@ ; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v16i32_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6424,11 +6517,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6462,20 +6555,20 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v32i16_to_v8i64_factor4: @@ -6483,19 +6576,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) +; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; @@ -6504,26 +6597,26 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -6539,11 +6632,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6576,18 +6669,18 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: paddb 48(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm3, 16(%rcx) ; SSE2-NEXT: retq ; @@ -6598,18 +6691,18 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) ; SSE42-NEXT: retq ; @@ -6619,20 +6712,20 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8: @@ -6659,16 +6752,16 @@ ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -6677,16 +6770,16 @@ ; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -6713,25 +6806,26 @@ ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,50,17,18,19,20,21,22,23,51,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 @@ -6799,52 +6893,107 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_v32i16_to_v2i256_factor16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec512_v32i16_to_v2i256_factor16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-LABEL: vec512_v32i16_to_v2i256_factor16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX2-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec512_v32i16_to_v2i256_factor16: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-FAST-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -6867,8 +7016,8 @@ ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm1, 16(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq @@ -6883,8 +7032,8 @@ ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq @@ -6904,18 +7053,44 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_v32i16_to_v1i512_factor32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_v32i16_to_v1i512_factor32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec512_v32i16_to_v1i512_factor32: ; AVX512F: # %bb.0: @@ -6960,19 +7135,19 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 48(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm4 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: paddb 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 48(%rcx) +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm4, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_v16i32_to_v8i64_factor2: @@ -6986,48 +7161,48 @@ ; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 48(%rcx) -; SSE42-NEXT: movdqa %xmm4, (%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7036,11 +7211,11 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7124,16 +7299,16 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7142,16 +7317,16 @@ ; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -7160,34 +7335,34 @@ ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec512_v16i32_to_v4i128_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,3,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7198,11 +7373,11 @@ ; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7276,18 +7451,17 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[1],zero,zero,zero ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec512_v16i32_to_v2i256_factor8: @@ -7309,13 +7483,13 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -7323,13 +7497,13 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -7340,11 +7514,11 @@ ; AVX512F-NEXT: movw $257, %ax # imm = 0x101 ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7397,8 +7571,8 @@ ; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: movaps 48(%rdx), %xmm3 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm3, 48(%rcx) +; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movaps %xmm0, 16(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq @@ -7413,8 +7587,8 @@ ; SSE42-NEXT: movaps 32(%rdx), %xmm2 ; SSE42-NEXT: movaps 48(%rdx), %xmm3 ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm3, 48(%rcx) +; SSE42-NEXT: movaps %xmm2, 32(%rcx) ; SSE42-NEXT: movaps %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq @@ -7423,9 +7597,9 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -7493,37 +7667,37 @@ ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: movq {{.*#+}} xmm3 = xmm0[0],zero ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb (%rdx), %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb 16(%rdx), %xmm0 ; SSE-NEXT: paddb 32(%rdx), %xmm2 -; SSE-NEXT: movdqa %xmm2, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) +; SSE-NEXT: movdqa %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm0, 16(%rcx) +; SSE-NEXT: movdqa %xmm3, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v8i64_to_v4i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -7532,14 +7706,14 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7550,11 +7724,11 @@ ; AVX512F-NEXT: movb $85, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7623,27 +7797,27 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v8i64_to_v2i256_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7654,11 +7828,11 @@ ; AVX512F-NEXT: movb $17, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7695,8 +7869,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -7705,8 +7879,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) @@ -7781,16 +7955,16 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) ; AVX-NEXT: vmovaps %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_v4i128_to_v2i256_factor2: @@ -7813,11 +7987,11 @@ ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7853,8 +8027,8 @@ ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm3, 48(%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq @@ -7885,8 +8059,9 @@ ; ; AVX512F-LABEL: vec512_v4i128_to_v1i512_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) @@ -7919,30 +8094,30 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movaps 32(%rdx), %xmm2 ; SSE-NEXT: movaps 48(%rdx), %xmm3 -; SSE-NEXT: paddb 16(%rdx), %xmm1 ; SSE-NEXT: paddb (%rdx), %xmm0 -; SSE-NEXT: movaps %xmm2, 32(%rcx) +; SSE-NEXT: paddb 16(%rdx), %xmm1 ; SSE-NEXT: movaps %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm0, (%rcx) +; SSE-NEXT: movaps %xmm2, 32(%rcx) ; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_v2i256_to_v1i512_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -316,7 +316,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -326,7 +326,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -336,7 +336,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -976,38 +976,77 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -1027,22 +1066,22 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1050,21 +1089,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1072,21 +1111,21 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1160,20 +1199,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1181,19 +1220,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1201,18 +1240,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1285,20 +1324,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1306,19 +1345,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1326,18 +1365,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1410,19 +1449,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1430,17 +1469,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,16 +1487,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1535,25 +1574,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1561,20 +1600,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1582,19 +1621,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1666,20 +1706,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1687,16 +1727,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1704,16 +1744,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1785,19 +1825,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1805,15 +1845,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1821,15 +1861,15 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1901,19 +1941,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1921,16 +1961,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1945,11 +1985,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2059,15 +2099,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2075,15 +2115,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2097,11 +2137,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2211,15 +2251,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2227,15 +2267,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2249,11 +2289,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2285,12 +2325,12 @@ ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2311,12 +2351,12 @@ ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2337,9 +2377,9 @@ ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -2415,10 +2455,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2441,61 +2481,120 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-SLOW-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq - %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 - %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 - %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias +; AVX512DQ-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-SLOW-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512BW-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq + %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 + %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 + %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias %broadcast.of.zextinreg = shufflevector <64 x i8> %in.vec, <64 x i8> zeroinitializer, <48 x i32> %out.bytevec.padded = shufflevector <48 x i8> %broadcast.of.zextinreg, <48 x i8> poison, <64 x i32> %out.vec.bias = load <64 x i8>, ptr %out.vec.bias.ptr, align 64 @@ -2561,13 +2660,13 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2604,10 +2703,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2623,10 +2724,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2710,10 +2813,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2738,39 +2841,43 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2857,13 +2964,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2900,10 +3007,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2919,10 +3028,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3007,10 +3118,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3035,39 +3146,43 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3120,8 +3235,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: @@ -3153,13 +3268,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3196,10 +3311,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3215,10 +3332,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3286,8 +3405,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm3, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -3300,11 +3419,11 @@ ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -3340,8 +3459,10 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper @@ -3360,8 +3481,10 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -3450,8 +3573,8 @@ ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3577,8 +3700,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: @@ -3590,91 +3713,176 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX2-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7],ymm2[8],ymm3[9],ymm2[10],ymm3[11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,0,27,0,29,0,31] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3734,8 +3942,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm3 ; SSE42-NEXT: movdqa %xmm3, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: @@ -3750,47 +3958,91 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3799,17 +4051,20 @@ ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -3818,18 +4073,22 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -3838,17 +4097,20 @@ ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14],ymm2[15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -3858,14 +4120,17 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,25,26,0,28,29,0,31] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -3934,8 +4199,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: @@ -3949,30 +4214,32 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -3983,8 +4250,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4000,8 +4269,10 @@ ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX2-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4012,32 +4283,38 @@ ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4048,32 +4325,38 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7] +; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7],ymm2[8],ymm3[9,10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4085,14 +4368,18 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,16,13,14,15,16,9,10,11,16,13,14,15] +; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4145,8 +4432,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4166,8 +4453,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4178,52 +4465,96 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -4232,17 +4563,20 @@ ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX512F-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -4251,18 +4585,22 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) @@ -4271,17 +4609,20 @@ ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX512DQ-FAST-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) @@ -4291,14 +4632,17 @@ ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,16,15,u,u,u,u,16,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4365,8 +4709,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4379,12 +4723,12 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4393,15 +4737,17 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4411,15 +4757,19 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4429,32 +4779,55 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,16,9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -4526,72 +4899,162 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX512F-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512F-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,14,15,u,u,u,u,16,u,u,u> +; AVX512BW-SLOW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 +; AVX512BW-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15] +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-FAST-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -4623,8 +5086,8 @@ ; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -4642,8 +5105,8 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -4654,28 +5117,28 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4685,8 +5148,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4736,11 +5199,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4754,25 +5217,25 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 +; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4782,9 +5245,9 @@ ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,29,0,31,0,1,0,1] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] -; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero @@ -4823,8 +5286,8 @@ ; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4843,8 +5306,8 @@ ; SSE42-NEXT: paddb 16(%rdx), %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4866,8 +5329,8 @@ ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -4907,8 +5370,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4926,8 +5389,8 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4941,11 +5404,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4959,11 +5422,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5026,8 +5489,8 @@ ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: paddb 32(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -5044,29 +5507,29 @@ ; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7] +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5084,8 +5547,8 @@ ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5099,11 +5562,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5117,11 +5580,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5304,8 +5767,8 @@ ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5321,8 +5784,8 @@ ; SSE42-NEXT: paddb %xmm0, %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5331,18 +5794,18 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[2] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5359,8 +5822,8 @@ ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5374,11 +5837,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5392,11 +5855,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5424,7 +5887,7 @@ ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -5479,19 +5942,19 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX-NEXT: vmovaps 32(%rdx), %ymm1 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5595,17 +6058,17 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5613,17 +6076,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5631,14 +6094,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5647,10 +6110,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5660,10 +6123,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5673,18 +6136,19 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5707,17 +6171,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5725,17 +6189,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5743,14 +6207,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5759,10 +6223,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5772,10 +6236,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5785,18 +6249,19 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5819,17 +6284,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5837,17 +6302,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5855,14 +6320,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5871,10 +6336,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5884,10 +6349,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5897,18 +6362,19 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -5930,17 +6396,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5948,14 +6414,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5964,10 +6430,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5977,10 +6443,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5990,18 +6456,19 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6039,14 +6506,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: @@ -6055,10 +6522,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6068,10 +6535,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6081,10 +6548,10 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6116,17 +6583,17 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6135,17 +6602,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6154,14 +6621,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: @@ -6170,10 +6637,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6183,10 +6650,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6196,19 +6663,20 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,57,0,59,0,61,0,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,0,35,0,37,0,39,0,41,0,43,0,45,0,47,16,49,16,51,16,53,16,55,16,57,16,59,16,61,16,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6233,17 +6701,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6253,17 +6721,17 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6273,14 +6741,14 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -6289,10 +6757,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6302,10 +6770,10 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6315,19 +6783,20 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,57,58,59,0,61,62,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,0,37,38,39,0,41,42,43,0,45,46,47,16,49,50,51,16,53,54,55,16,57,58,59,16,61,62,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6351,17 +6820,17 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6370,17 +6839,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa (%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6389,14 +6858,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -6406,10 +6875,10 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6420,10 +6889,10 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6434,19 +6903,20 @@ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,0,49,50,51,52,53,54,55,0,57,58,59,60,61,62,63] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,33,34,35,36,37,38,39,0,41,42,43,44,45,46,47,16,49,50,51,52,53,54,55,16,57,58,59,60,61,62,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6504,54 +6974,106 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512DQ-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512DQ-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: @@ -6583,17 +7105,17 @@ ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: movdqa (%rdx), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -6603,17 +7125,17 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: movdqa (%rdx), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -6624,15 +7146,15 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6643,10 +7165,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6657,11 +7179,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6672,11 +7194,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6710,17 +7232,17 @@ ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: movdqa 16(%rdx), %xmm0 +; SSE2-NEXT: movdqa (%rdx), %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: movdqa (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: movdqa 32(%rdx), %xmm3 ; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 48(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: @@ -6729,17 +7251,17 @@ ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa (%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rdx), %xmm2 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rdx), %xmm3 +; SSE42-NEXT: movdqa 32(%rdx), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 48(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rcx) +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: @@ -6749,15 +7271,15 @@ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6768,10 +7290,10 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6782,11 +7304,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6797,11 +7319,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6869,14 +7391,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: @@ -6885,10 +7407,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6899,11 +7421,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -6914,11 +7436,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -6950,17 +7472,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: @@ -6970,15 +7492,15 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6989,10 +7511,10 @@ ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7003,11 +7525,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7018,11 +7540,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7071,14 +7593,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: @@ -7086,10 +7608,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7100,11 +7622,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7115,11 +7637,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -7165,24 +7687,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rdx), %xmm2 ; AVX-NEXT: vmovaps 48(%rdx), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rcx) ; AVX-NEXT: vmovaps %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovaps %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -7193,11 +7715,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -7208,11 +7730,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -283,7 +283,7 @@ ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -292,7 +292,7 @@ ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -301,7 +301,7 @@ ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -609,16 +609,16 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -626,40 +626,37 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> @@ -711,12 +708,12 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -724,36 +721,29 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> @@ -855,19 +845,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -876,17 +866,17 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -896,16 +886,16 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -966,18 +956,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -989,10 +979,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1003,10 +993,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1064,18 +1054,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1087,10 +1077,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1101,10 +1091,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1164,15 +1154,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1180,24 +1170,24 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 +; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1268,10 +1258,10 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1279,16 +1269,16 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE42-NEXT: pshufb %xmm2, %xmm1 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1296,15 +1286,15 @@ ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1361,30 +1351,30 @@ ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1392,10 +1382,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1452,27 +1442,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1480,10 +1470,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1542,22 +1532,22 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: retq -; -; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: retq +; +; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1565,11 +1555,11 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1626,35 +1616,35 @@ ; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 32(%rdi), %xmm1 ; SSE2-NEXT: movaps 48(%rdi), %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1738,36 +1728,36 @@ ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 32(%rdi), %xmm1 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1780,62 +1770,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512DQ-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> zeroinitializer, <4 x i32> @@ -1897,10 +1857,10 @@ ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -1922,33 +1882,33 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2024,13 +1984,13 @@ ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2061,10 +2021,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2077,10 +2039,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero,xmm0[0],zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2150,10 +2114,10 @@ ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2183,10 +2147,12 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2200,10 +2166,12 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2277,13 +2245,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2314,10 +2282,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2330,10 +2300,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2404,10 +2376,10 @@ ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2429,35 +2401,39 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm3 -; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2465,7 +2441,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm2 ; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} @@ -2504,8 +2480,8 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: @@ -2531,13 +2507,13 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2568,10 +2544,12 @@ ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2584,10 +2562,12 @@ ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2645,8 +2625,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -2656,11 +2636,11 @@ ; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3: @@ -2691,10 +2671,12 @@ ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2709,10 +2691,12 @@ ; AVX512DQ-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2772,8 +2756,8 @@ ; SSE42-NEXT: paddb (%rsi), %xmm2 ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: movaps %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2786,8 +2770,8 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2905,61 +2889,65 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3014,109 +3002,155 @@ ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2],xmm1[3],mem[4,5],xmm1[6],mem[7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: @@ -3161,110 +3195,115 @@ ; SSE42-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15] -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-FAST-NEXT: vzeroupper @@ -3272,31 +3311,34 @@ ; ; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-SLOW-NEXT: vzeroupper ; AVX512DQ-SLOW-NEXT: retq ; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero -; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[0,1],zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -3342,8 +3384,8 @@ ; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3352,115 +3394,161 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: @@ -3504,76 +3592,84 @@ ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm1[1,2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3648,47 +3744,109 @@ ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq ; -; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX512F-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX512DQ-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX2-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512F-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512F-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: @@ -3713,35 +3871,34 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,0,1,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa 16(%rsi), %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,1,1] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdx) +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: @@ -3752,70 +3909,36 @@ ; AVX-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: @@ -3823,11 +3946,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3837,11 +3960,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3873,15 +3996,15 @@ ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -3890,88 +4013,53 @@ ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vbroadcastss (%rdi), %ymm2 -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7] -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vbroadcastss (%rdi), %ymm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = mem[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,1,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: @@ -3979,11 +4067,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3993,11 +4081,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4030,45 +4118,45 @@ ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4093,11 +4181,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4107,11 +4195,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4254,13 +4342,13 @@ ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -4269,29 +4357,30 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: paddb 32(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX-NEXT: vmovaps 48(%rdi), %xmm2 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4316,11 +4405,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4330,11 +4419,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4367,11 +4456,11 @@ ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rsi), %xmm2 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: movaps %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4381,28 +4470,27 @@ ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE42-NEXT: movaps 32(%rsi), %xmm2 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 32(%rdi), %ymm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] -; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX-NEXT: vmovaps 32(%rsi), %ymm2 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovapd (%rdi), %ymm0 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1] +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; AVX-NEXT: vmovaps 32(%rsi), %ymm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4477,58 +4565,58 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4536,10 +4624,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4547,10 +4635,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[0],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero,ymm0[16],zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4576,58 +4664,58 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4635,10 +4723,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4646,10 +4734,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[0],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16],zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4675,58 +4763,58 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4734,10 +4822,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4745,10 +4833,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4774,41 +4862,41 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4816,10 +4904,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4827,10 +4915,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4871,24 +4959,24 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4896,10 +4984,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4907,10 +4995,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] ; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4939,58 +5027,58 @@ ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; SSE42: # %bb.0: ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4998,10 +5086,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5009,10 +5097,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[0,1],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero,ymm0[16,17],zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5041,17 +5129,17 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -5059,17 +5147,17 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: @@ -5077,24 +5165,24 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5102,10 +5190,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5113,10 +5201,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5144,48 +5232,48 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pxor %xmm0, %xmm0 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa (%rsi), %xmm1 ; SSE42-NEXT: paddb %xmm0, %xmm1 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm0, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: @@ -5193,10 +5281,10 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5205,10 +5293,10 @@ ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5217,10 +5305,10 @@ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5278,48 +5366,92 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX2-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq +; +; AVX512F-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq +; +; AVX512F-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512F-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] -; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512DQ-FAST-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512DQ-FAST-NEXT: vpand (%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: @@ -5346,17 +5478,17 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,1,1] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rsi), %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm2 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm3, 48(%rdx) -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 48(%rdx) +; SSE2-NEXT: movdqa %xmm3, 32(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -5364,17 +5496,17 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 +; SSE42-NEXT: movdqa (%rsi), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: movdqa (%rsi), %xmm2 +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: movdqa 48(%rsi), %xmm3 +; SSE42-NEXT: movdqa 32(%rsi), %xmm3 ; SSE42-NEXT: paddb %xmm1, %xmm3 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, 48(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 48(%rdx) +; SSE42-NEXT: movdqa %xmm3, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: @@ -5383,15 +5515,15 @@ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5401,10 +5533,10 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -5412,10 +5544,10 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -5423,10 +5555,10 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero,ymm0[16,17,18,19],zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5436,11 +5568,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5450,11 +5582,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5482,17 +5614,17 @@ ; SSE-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: @@ -5500,15 +5632,15 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5517,10 +5649,10 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5529,11 +5661,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5542,11 +5674,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,16,5,6,7,16,9,10,11,16,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5587,23 +5719,24 @@ ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5612,11 +5745,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5625,11 +5758,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,1,2,3,4,5,6,7,16,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5656,17 +5789,17 @@ ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: @@ -5674,15 +5807,15 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -5691,10 +5824,10 @@ ; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5704,11 +5837,11 @@ ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5718,11 +5851,11 @@ ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,0,11,0,13,0,15] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5764,23 +5897,24 @@ ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5789,11 +5923,11 @@ ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5802,11 +5936,11 @@ ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [8,1,2,3,8,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2q (%rdi), %zmm0, %zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5835,13 +5969,13 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movaps 16(%rsi), %xmm1 ; SSE-NEXT: movaps 48(%rsi), %xmm2 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa (%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: paddb 32(%rsi), %xmm0 ; SSE-NEXT: movaps %xmm2, 48(%rdx) ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: @@ -5891,14 +6025,14 @@ ; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovaps 16(%rsi), %xmm2 ; AVX-NEXT: vmovaps 48(%rsi), %xmm3 -; AVX-NEXT: vmovaps %xmm2, 16(%rdx) ; AVX-NEXT: vmovaps %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovaps %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq ; @@ -5949,50 +6083,175 @@ ; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: shrq $56, %rax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq (%rdi), %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: movq %rax, %r9 +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: movl %eax, %r11d +; AVX512F-NEXT: movl %eax, %ebx +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $16, %ebx +; AVX512F-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $24, %r11d +; AVX512F-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $32, %r10 +; AVX512F-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $40, %r9 +; AVX512F-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $48, %r8 +; AVX512F-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: movq 8(%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: movq (%rdi), %rax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: movl %eax, %r11d +; AVX512DQ-NEXT: movl %eax, %ebx +; AVX512DQ-NEXT: vmovd %eax, %xmm0 +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $16, %ebx +; AVX512DQ-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $24, %r11d +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $32, %r10 +; AVX512DQ-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $40, %r9 +; AVX512DQ-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $48, %r8 +; AVX512DQ-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq 8(%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,10,11,0,1,14,15] -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq (%rdi), %rax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: movl %eax, %r11d +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $16, %ebx +; AVX512BW-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $24, %r11d +; AVX512BW-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $32, %r10 +; AVX512BW-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $40, %r9 +; AVX512BW-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $48, %r8 +; AVX512BW-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: movq 8(%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 diff --git a/llvm/test/CodeGen/X86/zext-demanded.ll b/llvm/test/CodeGen/X86/zext-demanded.ll --- a/llvm/test/CodeGen/X86/zext-demanded.ll +++ b/llvm/test/CodeGen/X86/zext-demanded.ll @@ -140,7 +140,8 @@ define i32 @PR36689(ptr) { ; CHECK-LABEL: PR36689: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: movzbl 1(%rdi), %eax +; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: orl $255, %eax ; CHECK-NEXT: retq %2 = load i32, ptr %0 diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll --- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll +++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll @@ -14,7 +14,7 @@ ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: shll $2, %eax ; X64-NEXT: andl $60, %eax ; X64-NEXT: retq @@ -37,7 +37,7 @@ ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: andl $15, %eax ; X64-NEXT: leaq (%rdi,%rax,4), %rax ; X64-NEXT: retq @@ -89,9 +89,9 @@ ; ; X64-LABEL: test4: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: shrq $2, %rax -; X64-NEXT: andl $60, %eax +; X64-NEXT: andl $-4, %eax ; X64-NEXT: retq entry: %bf.load = load i8, ptr %data, align 4 diff --git a/llvm/test/CodeGen/X86/zext-lshr.ll b/llvm/test/CodeGen/X86/zext-lshr.ll --- a/llvm/test/CodeGen/X86/zext-lshr.ll +++ b/llvm/test/CodeGen/X86/zext-lshr.ll @@ -42,7 +42,7 @@ ; X64-LABEL: i64_zext_shift_i16_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shrq $5, %rax +; X64-NEXT: shrl $5, %eax ; X64-NEXT: retq %t0 = zext i8 %a0 to i16 %t1 = lshr i16 %t0, 5 diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll --- a/llvm/test/CodeGen/X86/zext-shl.ll +++ b/llvm/test/CodeGen/X86/zext-shl.ll @@ -51,7 +51,7 @@ ; X64-LABEL: i64_zext_shift_i16_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $5, %rax +; X64-NEXT: shll $5, %eax ; X64-NEXT: retq %t0 = zext i8 %a0 to i16 %t1 = shl i16 %t0, 5